From e28c5d588cb9fc98c8b7a7ceb3a094eb478a0ca4 Mon Sep 17 00:00:00 2001
From: Sebastian Ramacher <sramacher@debian.org>
Date: Mon, 9 Sep 2019 20:28:47 +0200
Subject: [PATCH] Import libsoxr_0.1.3.orig.tar.xz

[dgit import orig libsoxr_0.1.3.orig.tar.xz]
---
 AUTHORS                                |    1 +
 CMakeLists.txt                         |  335 ++++
 COPYING.LGPL                           |  502 ++++++
 INSTALL                                |  183 +++
 LICENCE                                |   23 +
 NEWS                                   |   46 +
 README                                 |   53 +
 TODO                                   |    3 +
 cmake/Modules/FindCFlags.cmake         |   35 +
 cmake/Modules/FindLibAVCodec.cmake     |   23 +
 cmake/Modules/FindLibAVUtil.cmake      |   23 +
 cmake/Modules/FindSIMD32.cmake         |   54 +
 cmake/Modules/FindSIMD64.cmake         |   29 +
 cmake/Modules/SetSystemProcessor.cmake |   37 +
 deinstall.cmake.in                     |   25 +
 examples/1-single-block.c              |   50 +
 examples/1a-lsr.c                      |   40 +
 examples/2-stream.C                    |   78 +
 examples/3-options-input-fn.c          |  114 ++
 examples/4-split-channels.c            |  161 ++
 examples/5-variable-rate.c             |   93 ++
 examples/CMakeLists.txt                |   36 +
 examples/README                        |   20 +
 examples/examples-common.h             |   47 +
 go                                     |   18 +
 go.bat                                 |   27 +
 inst-check                             |   25 +
 inst-check-soxr                        |   52 +
 inst-check-soxr-lsr                    |    1 +
 msvc/README                            |   22 +
 msvc/example1.vcproj                   |   82 +
 msvc/libsoxr.sln                       |   29 +
 msvc/libsoxr.vcproj                    |   97 ++
 msvc/soxr-config.h                     |   30 +
 multi-arch                             |   31 +
 soxr-config.h.in                       |   27 +
 src/CMakeLists.txt                     |  129 ++
 src/aliases.h                          |   39 +
 src/avfft32.c                          |   33 +
 src/avfft32s.c                         |   32 +
 src/ccrw2.h                            |   75 +
 src/cr-core.c                          |  314 ++++
 src/cr.c                               |  588 +++++++
 src/cr.h                               |  178 +++
 src/cr32.c                             |    8 +
 src/cr32s.c                            |    8 +
 src/cr64.c                             |    8 +
 src/cr64s.c                            |    8 +
 src/data-io.c                          |  223 +++
 src/data-io.h                          |   39 +
 src/dbesi0.c                           |  149 ++
 src/dev32s.h                           |   54 +
 src/dev64s.h                           |   42 +
 src/fft4g.c                            | 1346 ++++++++++++++++
 src/fft4g.h                            |   23 +
 src/fft4g32.c                          |   36 +
 src/fft4g32s.c                         |   31 +
 src/fft4g64.c                          |   35 +
 src/fft4g_cache.h                      |   92 ++
 src/fifo.h                             |  125 ++
 src/filter.c                           |  277 ++++
 src/filter.h                           |   44 +
 src/half-coefs.h                       |   75 +
 src/half-fir.h                         |   61 +
 src/internal.h                         |   84 +
 src/math-wrap.h                        |   31 +
 src/pffft-avx.h                        |   40 +
 src/pffft-wrap.c                       |  110 ++
 src/pffft.c                            | 1946 ++++++++++++++++++++++++
 src/pffft.h                            |  197 +++
 src/pffft32.c                          |   39 +
 src/pffft32s.c                         |   34 +
 src/pffft64s.c                         |   34 +
 src/poly-fir.h                         |  150 ++
 src/poly-fir0.h                        |   56 +
 src/rdft.h                             |   31 +
 src/rdft_t.h                           |   24 +
 src/rint-clip.h                        |  158 ++
 src/rint.h                             |  102 ++
 src/samplerate.h                       |    1 +
 src/soxr-lsr.c                         |  198 +++
 src/soxr-lsr.h                         |   78 +
 src/soxr-lsr.pc.in                     |    5 +
 src/soxr.c                             |  842 ++++++++++
 src/soxr.h                             |  344 +++++
 src/soxr.pc.in                         |    5 +
 src/std-types.h                        |   48 +
 src/util-simd.c                        |   89 ++
 src/util32s.c                          |    8 +
 src/util32s.h                          |   23 +
 src/util64s.c                          |    8 +
 src/util64s.h                          |   23 +
 src/vr-coefs.c                         |  115 ++
 src/vr-coefs.h                         |   94 ++
 src/vr32.c                             |  651 ++++++++
 tests/1-delay-clear.c                  |   64 +
 tests/CMakeLists.txt                   |   62 +
 tests/README                           |    1 +
 tests/bandwidth-test                   |   41 +
 tests/cmp-test.cmake                   |   30 +
 tests/eg-test                          |   48 +
 tests/io-test                          |   65 +
 tests/large-ratio-test                 |   22 +
 tests/phase-test                       |   39 +
 tests/q-test                           |   73 +
 tests/scripts                          |   14 +
 tests/throughput-test                  |   11 +
 tests/throughput-test.bat              |    5 +
 tests/throughput.c                     |  141 ++
 tests/time-test                        |   36 +
 tests/vector-cmp.c                     |   56 +
 tests/vector-gen.c                     |   61 +
 112 files changed, 12861 insertions(+)
 create mode 100644 AUTHORS
 create mode 100644 CMakeLists.txt
 create mode 100644 COPYING.LGPL
 create mode 100644 INSTALL
 create mode 100644 LICENCE
 create mode 100644 NEWS
 create mode 100644 README
 create mode 100644 TODO
 create mode 100644 cmake/Modules/FindCFlags.cmake
 create mode 100644 cmake/Modules/FindLibAVCodec.cmake
 create mode 100644 cmake/Modules/FindLibAVUtil.cmake
 create mode 100644 cmake/Modules/FindSIMD32.cmake
 create mode 100644 cmake/Modules/FindSIMD64.cmake
 create mode 100644 cmake/Modules/SetSystemProcessor.cmake
 create mode 100644 deinstall.cmake.in
 create mode 100644 examples/1-single-block.c
 create mode 100644 examples/1a-lsr.c
 create mode 100644 examples/2-stream.C
 create mode 100644 examples/3-options-input-fn.c
 create mode 100644 examples/4-split-channels.c
 create mode 100644 examples/5-variable-rate.c
 create mode 100644 examples/CMakeLists.txt
 create mode 100644 examples/README
 create mode 100644 examples/examples-common.h
 create mode 100755 go
 create mode 100644 go.bat
 create mode 100755 inst-check
 create mode 100755 inst-check-soxr
 create mode 120000 inst-check-soxr-lsr
 create mode 100644 msvc/README
 create mode 100644 msvc/example1.vcproj
 create mode 100644 msvc/libsoxr.sln
 create mode 100644 msvc/libsoxr.vcproj
 create mode 100644 msvc/soxr-config.h
 create mode 100755 multi-arch
 create mode 100644 soxr-config.h.in
 create mode 100644 src/CMakeLists.txt
 create mode 100644 src/aliases.h
 create mode 100644 src/avfft32.c
 create mode 100644 src/avfft32s.c
 create mode 100644 src/ccrw2.h
 create mode 100644 src/cr-core.c
 create mode 100644 src/cr.c
 create mode 100644 src/cr.h
 create mode 100644 src/cr32.c
 create mode 100644 src/cr32s.c
 create mode 100644 src/cr64.c
 create mode 100644 src/cr64s.c
 create mode 100644 src/data-io.c
 create mode 100644 src/data-io.h
 create mode 100644 src/dbesi0.c
 create mode 100644 src/dev32s.h
 create mode 100644 src/dev64s.h
 create mode 100644 src/fft4g.c
 create mode 100644 src/fft4g.h
 create mode 100644 src/fft4g32.c
 create mode 100644 src/fft4g32s.c
 create mode 100644 src/fft4g64.c
 create mode 100644 src/fft4g_cache.h
 create mode 100644 src/fifo.h
 create mode 100644 src/filter.c
 create mode 100644 src/filter.h
 create mode 100644 src/half-coefs.h
 create mode 100644 src/half-fir.h
 create mode 100644 src/internal.h
 create mode 100644 src/math-wrap.h
 create mode 100644 src/pffft-avx.h
 create mode 100644 src/pffft-wrap.c
 create mode 100644 src/pffft.c
 create mode 100644 src/pffft.h
 create mode 100644 src/pffft32.c
 create mode 100644 src/pffft32s.c
 create mode 100644 src/pffft64s.c
 create mode 100644 src/poly-fir.h
 create mode 100644 src/poly-fir0.h
 create mode 100644 src/rdft.h
 create mode 100644 src/rdft_t.h
 create mode 100644 src/rint-clip.h
 create mode 100644 src/rint.h
 create mode 100644 src/samplerate.h
 create mode 100644 src/soxr-lsr.c
 create mode 100644 src/soxr-lsr.h
 create mode 100644 src/soxr-lsr.pc.in
 create mode 100644 src/soxr.c
 create mode 100644 src/soxr.h
 create mode 100644 src/soxr.pc.in
 create mode 100644 src/std-types.h
 create mode 100644 src/util-simd.c
 create mode 100644 src/util32s.c
 create mode 100644 src/util32s.h
 create mode 100644 src/util64s.c
 create mode 100644 src/util64s.h
 create mode 100644 src/vr-coefs.c
 create mode 100644 src/vr-coefs.h
 create mode 100644 src/vr32.c
 create mode 100644 tests/1-delay-clear.c
 create mode 100644 tests/CMakeLists.txt
 create mode 100644 tests/README
 create mode 100755 tests/bandwidth-test
 create mode 100644 tests/cmp-test.cmake
 create mode 100755 tests/eg-test
 create mode 100755 tests/io-test
 create mode 100755 tests/large-ratio-test
 create mode 100755 tests/phase-test
 create mode 100755 tests/q-test
 create mode 100755 tests/scripts
 create mode 100755 tests/throughput-test
 create mode 100644 tests/throughput-test.bat
 create mode 100644 tests/throughput.c
 create mode 100755 tests/time-test
 create mode 100644 tests/vector-cmp.c
 create mode 100644 tests/vector-gen.c

diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 0000000..2ba76d3
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1 @@
+Rob Sykes <robs@users.sourceforge.net>
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 0000000..ee48f6c
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,335 @@
+# SoX Resampler Library       Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+cmake_minimum_required (VERSION 3.1 FATAL_ERROR)
+
+project (soxr C)
+set (DESCRIPTION_SUMMARY
+    "High quality, one-dimensional sample-rate conversion library")
+
+
+
+# Release versioning:
+
+set (PROJECT_VERSION_MAJOR 0)
+set (PROJECT_VERSION_MINOR 1)
+set (PROJECT_VERSION_PATCH 3)
+
+# Shared-object version (current/revision/age); if, since the last public release:
+#   1) library code changed at all: ++revision
+#   2) interfaces changed at all:   ++current, revision = 0
+#   3) interfaces added:            ++age
+#   4) interfaces removed:          age = 0
+
+set (SO_VERSION_CURRENT  1)
+set (SO_VERSION_REVISION 2)
+set (SO_VERSION_AGE      1)
+
+math (EXPR SO_VERSION_MAJOR "${SO_VERSION_CURRENT} - ${SO_VERSION_AGE}") # major = current - age
+math (EXPR SO_VERSION_MINOR "${SO_VERSION_AGE}") # minor = age
+math (EXPR SO_VERSION_PATCH "${SO_VERSION_REVISION}") # patch = revision
+
+
+
+# Main options:
+
+include (CMakeDependentOption)
+
+if (NOT CMAKE_BUILD_TYPE)
+  set (CMAKE_BUILD_TYPE Release CACHE STRING
+    "Build type, one of: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+endif ()
+
+option (BUILD_TESTS "Build sanity-tests." ON)
+option (BUILD_EXAMPLES "Build examples." OFF)
+option (WITH_OPENMP "Include OpenMP threading." ON)
+option (WITH_LSR_BINDINGS "Include a `libsamplerate'-like interface." ON)
+
+cmake_dependent_option (BUILD_SHARED_LIBS
+  "Build shared (dynamic) soxr libraries." ON
+  "NOT WITH_DEV_GPROF" OFF) # forced OFF while WITH_DEV_GPROF is enabled
+cmake_dependent_option (WITH_VR32
+  "Include HQ variable-rate resampling engine." ON
+  "WITH_CR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S OR NOT DEFINED WITH_VR32" ON) # forced ON if no CR engine is enabled
+cmake_dependent_option (WITH_CR32
+  "Include HQ constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR64 OR WITH_CR32S OR WITH_CR64S" ON) # forced ON if every other engine is disabled
+cmake_dependent_option (WITH_CR64
+  "Include VHQ constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64S" ON) # forced ON if every other engine is disabled
+cmake_dependent_option (WITH_CR64S
+  "Include VHQ SIMD constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR32 OR WITH_CR32S OR WITH_CR64" ON) # forced ON if every other engine is disabled
+cmake_dependent_option (WITH_CR32S
+  "Include HQ SIMD constant-rate resampling engine." ON
+  "WITH_VR32 OR WITH_CR64 OR WITH_CR32 OR WITH_CR64S" ON) # forced ON if every other engine is disabled
+cmake_dependent_option (WITH_PFFFT
+  "Use PFFFT (BSD-like licence) for HQ SIMD DFT." ON
+  "WITH_CR32S;NOT WITH_AVFFT" OFF) # forced OFF without WITH_CR32S or when WITH_AVFFT is chosen
+cmake_dependent_option (WITH_AVFFT
+  "Use libavcodec (LGPL) for HQ SIMD DFT." OFF
+  "WITH_CR32S;NOT WITH_PFFFT" OFF) # forced OFF without WITH_CR32S or when WITH_PFFFT is chosen
+cmake_dependent_option (BUILD_LSR_TESTS "Build LSR tests." OFF
+  "UNIX;NOT CMAKE_CROSSCOMPILING;EXISTS ${PROJECT_SOURCE_DIR}/lsr-tests;WITH_LSR_BINDINGS" OFF) # forced OFF unless all four conditions hold
+
+option (WITH_HI_PREC_CLOCK "Enable high-precision time-base." ON)
+option (WITH_FLOAT_STD_PREC_CLOCK
+  "Use floating-point for standard-precision time-base." OFF)
+option (WITH_DEV_TRACE "Enable developer trace capability." ON)
+option (WITH_DEV_GPROF "Enable developer gprof output." OFF) # adds -pg for GNU/Clang builds
+mark_as_advanced (WITH_HI_PREC_CLOCK WITH_FLOAT_STD_PREC_CLOCK
+  WITH_DEV_TRACE WITH_DEV_GPROF) # keep these four out of the default (non-advanced) option list
+
+
+
+# Introspection (project modules are found relative to this project's own root):
+
+list (APPEND CMAKE_MODULE_PATH ${PROJECT_SOURCE_DIR}/cmake/Modules) # not CMAKE_SOURCE_DIR: that is the parent's root under add_subdirectory
+
+include (CheckFunctionExists)
+include (CheckIncludeFiles)
+include (CheckLibraryExists)
+include (SetSystemProcessor) # project module, resolved through CMAKE_MODULE_PATH above
+include (TestBigEndian)
+
+set_system_processor ()
+
+check_library_exists (m pow "" NEED_LIBM)
+if (NEED_LIBM)
+  set (CMAKE_REQUIRED_LIBRARIES "m;${CMAKE_REQUIRED_LIBRARIES}")
+  set (LIBM_LIBRARIES m)
+endif ()
+
+if (BUILD_EXAMPLES)
+  enable_language (CXX) # Adds c++ compiler; a second project() call would clear PROJECT_VERSION_* (CMP0048)
+endif ()
+
+if (WITH_OPENMP)
+  find_package (OpenMP)
+  if (OPENMP_FOUND)
+    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+    if (MINGW) # Is this still needed?
+      set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+      set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} ${OpenMP_C_FLAGS}")
+    endif ()
+  endif()
+endif ()
+
+if (WITH_CR32S)
+  find_package (SIMD32) # not REQUIRED: a failed probe just disables the engine
+  set (WITH_CR32S ${SIMD32_FOUND}) # from here on the option reflects the probe result
+endif ()
+
+if (WITH_CR64S)
+  find_package (SIMD64) # not REQUIRED: a failed probe just disables the engine
+  set (WITH_CR64S ${SIMD64_FOUND}) # from here on the option reflects the probe result
+endif ()
+
+if (WITH_AVFFT)
+  find_package (LibAVCodec REQUIRED)
+  if (AVCODEC_FOUND)
+    include_directories (${AVCODEC_INCLUDE_DIRS})
+    set (LIBS ${LIBS} ${AVCODEC_LIBRARIES})
+  endif ()
+endif ()
+
+if (WITH_AVFFT OR (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm" AND SIMD32_FOUND AND WITH_CR32))
+  find_package (LibAVUtil)
+  if (AVUTIL_FOUND)
+    include_directories (${AVUTIL_INCLUDE_DIRS})
+    set (LIBS ${LIBS} ${AVUTIL_LIBRARIES})
+  endif ()
+endif ()
+
+check_function_exists (lrint HAVE_LRINT)
+check_include_files (fenv.h HAVE_FENV_H)
+check_include_files (stdbool.h HAVE_STDBOOL_H)
+check_include_files (stdint.h HAVE_STDINT_H)
+test_big_endian (HAVE_BIGENDIAN)
+
+
+
+# Compiler configuration:
+
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wconversion -Wall -Wextra \
+      -pedantic -Wundef -Wpointer-arith -Wno-long-long")
+  if (CMAKE_C_COMPILER_ID STREQUAL "Clang")
+    set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -Wno-keyword-macro")
+  endif ()
+  if (WITH_DEV_GPROF)
+    set (PROJECT_CXX_FLAGS "${PROJECT_CXX_FLAGS} -pg")
+  endif ()
+  # Can use std=c89, but gnu89 should give faster sinf, cosf, etc.:
+  set (PROJECT_C_FLAGS "${PROJECT_CXX_FLAGS} \
+       -std=gnu89 -Wnested-externs -Wmissing-prototypes -Wstrict-prototypes")
+  if (CMAKE_BUILD_TYPE STREQUAL "Release")
+    set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -s") # strip
+  endif ()
+  cmake_dependent_option (VISIBILITY_HIDDEN
+    "Build shared libraries with -fvisibility=hidden." ON
+    "BUILD_SHARED_LIBS" OFF)
+  mark_as_advanced (VISIBILITY_HIDDEN)
+  if (VISIBILITY_HIDDEN)
+    add_definitions (-fvisibility=hidden -DSOXR_VISIBILITY)
+  endif ()
+endif ()
+
+if (MSVC)
+  add_definitions (-D_USE_MATH_DEFINES -D_CRT_SECURE_NO_WARNINGS)
+  option (BUILD_SHARED_RUNTIME "MSVC, link with runtime dynamically."  ON)
+  if (NOT BUILD_SHARED_RUNTIME)
+    foreach (flag_var
+        CMAKE_C_FLAGS                CMAKE_CXX_FLAGS
+        CMAKE_C_FLAGS_DEBUG          CMAKE_CXX_FLAGS_DEBUG
+        CMAKE_C_FLAGS_RELEASE        CMAKE_CXX_FLAGS_RELEASE
+        CMAKE_C_FLAGS_MINSIZEREL     CMAKE_CXX_FLAGS_MINSIZEREL
+        CMAKE_C_FLAGS_RELWITHDEBINFO CMAKE_CXX_FLAGS_RELWITHDEBINFO)
+      string (REGEX REPLACE "/MD" "/MT" ${flag_var} "${${flag_var}}")
+    endforeach ()
+  endif ()
+  # By default, do not warn when built on machines using only VS Express:
+  if (NOT DEFINED CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS)
+    set (CMAKE_INSTALL_SYSTEM_RUNTIME_LIBS_NO_WARNINGS ON)
+  endif ()
+endif ()
+
+
+
+# Build configuration:
+
+if (${BUILD_SHARED_LIBS} AND ${CMAKE_SYSTEM_NAME} STREQUAL Windows)
+  # Allow exes to find dlls:
+  set (BIN ${PROJECT_BINARY_DIR}/bin/)
+  set (EXAMPLES_BIN ${BIN})
+  set (CMAKE_LIBRARY_OUTPUT_DIRECTORY ${BIN})
+  set (CMAKE_RUNTIME_OUTPUT_DIRECTORY ${BIN})
+else ()
+  set (BIN ./)
+  set (EXAMPLES_BIN ../examples/)
+endif ()
+
+set (LIB_TYPE STATIC)
+if (BUILD_SHARED_LIBS)
+  set (LIB_TYPE SHARED)
+  if (MSVC)
+    add_definitions (-DSOXR_DLL)
+  endif ()
+endif ()
+
+if (CMAKE_BUILD_TYPE STREQUAL "None") # As used by some distros.
+  add_definitions (-DNDEBUG)
+endif ()
+
+
+
+# Installation configuration:
+
+if (NOT DEFINED BIN_INSTALL_DIR)
+  set (BIN_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/bin")
+endif ()
+if (NOT DEFINED LIB_INSTALL_DIR)
+  set (LIB_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/lib${LIB_SUFFIX}")
+endif ()
+if (NOT DEFINED INCLUDE_INSTALL_DIR)
+  set (INCLUDE_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/include")
+endif ()
+if (NOT DEFINED DOC_INSTALL_DIR)
+  if (UNIX)
+    set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/share/doc/lib${PROJECT_NAME}")
+  else ()
+    set (DOC_INSTALL_DIR "${CMAKE_INSTALL_PREFIX}/doc")
+  endif ()
+endif ()
+
+if (APPLE)
+  option (BUILD_FRAMEWORK "Build an OS X framework." OFF)
+  set (FRAMEWORK_INSTALL_DIR
+      "/Library/Frameworks" CACHE STRING "Directory to install frameworks to.")
+endif ()
+
+
+
+# Top-level:
+
+set (PROJECT_VERSION
+    ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH})
+set (SO_VERSION ${SO_VERSION_MAJOR}.${SO_VERSION_MINOR}.${SO_VERSION_PATCH})
+
+configure_file (
+  ${PROJECT_SOURCE_DIR}/${PROJECT_NAME}-config.h.in
+  ${PROJECT_BINARY_DIR}/${PROJECT_NAME}-config.h)
+include_directories (${PROJECT_BINARY_DIR})
+
+if (NOT CMAKE_CROSSCOMPILING AND (BUILD_TESTS OR BUILD_LSR_TESTS))
+  enable_testing ()
+endif ()
+
+install (FILES
+  ${CMAKE_CURRENT_SOURCE_DIR}/README
+  ${CMAKE_CURRENT_SOURCE_DIR}/LICENCE
+  ${CMAKE_CURRENT_SOURCE_DIR}/NEWS
+  DESTINATION ${DOC_INSTALL_DIR})
+
+
+
+# Subdirectories:
+
+include_directories (${PROJECT_SOURCE_DIR}/src)
+
+add_subdirectory (src)
+if (BUILD_TESTS)
+  add_subdirectory (tests)
+endif ()
+if (BUILD_LSR_TESTS)
+  add_subdirectory (lsr-tests)
+endif ()
+if (BUILD_EXAMPLES OR BUILD_TESTS)
+  add_subdirectory (examples)
+endif ()
+
+
+
+# GNU Autotools compatibility; 'make check':
+
+add_custom_target (check COMMAND ${CMAKE_CTEST_COMMAND})
+
+
+
+# GNU Autotools compatibility; 'make distclean' ('make clean', then delete generated CMake/CTest files, config headers, *.pc and *.s32):
+
+if (UNIX)
+  add_custom_target (distclean COMMAND make clean && find .
+      \\! -path \\*/Modules/\\* \\! -name cmp-test.cmake -a -name \\*.cmake
+      -o -name CMakeFiles -o -name Makefile -o -name CMakeCache.txt -o -name
+      Testing -o -name cmake_install.cmake -o -name install_manifest.txt -o
+      -path ./soxr-config.h -o -name config.h -o -name \\*.pc -o -name \\*.s32
+      | xargs rm -rf)
+endif () # NOTE(review): 'make' is hard-coded, so this presumably assumes a Makefile generator - confirm
+
+
+
+# Deinstallation:
+
+configure_file (
+  "${CMAKE_CURRENT_SOURCE_DIR}/deinstall.cmake.in"
+  "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake"
+  IMMEDIATE @ONLY)
+
+add_custom_target (deinstall
+  COMMAND ${CMAKE_COMMAND} -P "${CMAKE_CURRENT_BINARY_DIR}/deinstall.cmake")
+
+
+
+# Packaging:
+
+if (UNIX)
+  set (CPACK_PACKAGE_VERSION_MAJOR "${PROJECT_VERSION_MAJOR}")
+  set (CPACK_PACKAGE_VERSION_MINOR "${PROJECT_VERSION_MINOR}")
+  set (CPACK_PACKAGE_VERSION_PATCH "${PROJECT_VERSION_PATCH}")
+  set (CPACK_SOURCE_GENERATOR "TXZ")
+  set (CPACK_SOURCE_IGNORE_FILES
+      "dist;/lsr-tests/;/Debug.*/;/Release.*/;\\\\.swp$;\\\\.git.*;/\\\\.git/")
+  include (CPack)
+endif ()
diff --git a/COPYING.LGPL b/COPYING.LGPL
new file mode 100644
index 0000000..551cb4a
--- /dev/null
+++ b/COPYING.LGPL
@@ -0,0 +1,502 @@
+		  GNU LESSER GENERAL PUBLIC LICENSE
+		       Version 2.1, February 1999
+
+ Copyright (C) 1991, 1999 Free Software Foundation, Inc.
+ 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+[This is the first released version of the Lesser GPL.  It also counts
+ as the successor of the GNU Library Public License, version 2, hence
+ the version number 2.1.]
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+Licenses are intended to guarantee your freedom to share and change
+free software--to make sure the software is free for all its users.
+
+  This license, the Lesser General Public License, applies to some
+specially designated software packages--typically libraries--of the
+Free Software Foundation and other authors who decide to use it.  You
+can use it too, but we suggest you first think carefully about whether
+this license or the ordinary General Public License is the better
+strategy to use in any particular case, based on the explanations below.
+
+  When we speak of free software, we are referring to freedom of use,
+not price.  Our General Public Licenses are designed to make sure that
+you have the freedom to distribute copies of free software (and charge
+for this service if you wish); that you receive source code or can get
+it if you want it; that you can change the software and use pieces of
+it in new free programs; and that you are informed that you can do
+these things.
+
+  To protect your rights, we need to make restrictions that forbid
+distributors to deny you these rights or to ask you to surrender these
+rights.  These restrictions translate to certain responsibilities for
+you if you distribute copies of the library or if you modify it.
+
+  For example, if you distribute copies of the library, whether gratis
+or for a fee, you must give the recipients all the rights that we gave
+you.  You must make sure that they, too, receive or can get the source
+code.  If you link other code with the library, you must provide
+complete object files to the recipients, so that they can relink them
+with the library after making changes to the library and recompiling
+it.  And you must show them these terms so they know their rights.
+
+  We protect your rights with a two-step method: (1) we copyright the
+library, and (2) we offer you this license, which gives you legal
+permission to copy, distribute and/or modify the library.
+
+  To protect each distributor, we want to make it very clear that
+there is no warranty for the free library.  Also, if the library is
+modified by someone else and passed on, the recipients should know
+that what they have is not the original version, so that the original
+author's reputation will not be affected by problems that might be
+introduced by others.
+
+  Finally, software patents pose a constant threat to the existence of
+any free program.  We wish to make sure that a company cannot
+effectively restrict the users of a free program by obtaining a
+restrictive license from a patent holder.  Therefore, we insist that
+any patent license obtained for a version of the library must be
+consistent with the full freedom of use specified in this license.
+
+  Most GNU software, including some libraries, is covered by the
+ordinary GNU General Public License.  This license, the GNU Lesser
+General Public License, applies to certain designated libraries, and
+is quite different from the ordinary General Public License.  We use
+this license for certain libraries in order to permit linking those
+libraries into non-free programs.
+
+  When a program is linked with a library, whether statically or using
+a shared library, the combination of the two is legally speaking a
+combined work, a derivative of the original library.  The ordinary
+General Public License therefore permits such linking only if the
+entire combination fits its criteria of freedom.  The Lesser General
+Public License permits more lax criteria for linking other code with
+the library.
+
+  We call this license the "Lesser" General Public License because it
+does Less to protect the user's freedom than the ordinary General
+Public License.  It also provides other free software developers Less
+of an advantage over competing non-free programs.  These disadvantages
+are the reason we use the ordinary General Public License for many
+libraries.  However, the Lesser license provides advantages in certain
+special circumstances.
+
+  For example, on rare occasions, there may be a special need to
+encourage the widest possible use of a certain library, so that it becomes
+a de-facto standard.  To achieve this, non-free programs must be
+allowed to use the library.  A more frequent case is that a free
+library does the same job as widely used non-free libraries.  In this
+case, there is little to gain by limiting the free library to free
+software only, so we use the Lesser General Public License.
+
+  In other cases, permission to use a particular library in non-free
+programs enables a greater number of people to use a large body of
+free software.  For example, permission to use the GNU C Library in
+non-free programs enables many more people to use the whole GNU
+operating system, as well as its variant, the GNU/Linux operating
+system.
+
+  Although the Lesser General Public License is Less protective of the
+users' freedom, it does ensure that the user of a program that is
+linked with the Library has the freedom and the wherewithal to run
+that program using a modified version of the Library.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.  Pay close attention to the difference between a
+"work based on the library" and a "work that uses the library".  The
+former contains code derived from the library, whereas the latter must
+be combined with the library in order to run.
+
+		  GNU LESSER GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License Agreement applies to any software library or other
+program which contains a notice placed by the copyright holder or
+other authorized party saying it may be distributed under the terms of
+this Lesser General Public License (also called "this License").
+Each licensee is addressed as "you".
+
+  A "library" means a collection of software functions and/or data
+prepared so as to be conveniently linked with application programs
+(which use some of those functions and data) to form executables.
+
+  The "Library", below, refers to any such software library or work
+which has been distributed under these terms.  A "work based on the
+Library" means either the Library or any derivative work under
+copyright law: that is to say, a work containing the Library or a
+portion of it, either verbatim or with modifications and/or translated
+straightforwardly into another language.  (Hereinafter, translation is
+included without limitation in the term "modification".)
+
+  "Source code" for a work means the preferred form of the work for
+making modifications to it.  For a library, complete source code means
+all the source code for all modules it contains, plus any associated
+interface definition files, plus the scripts used to control compilation
+and installation of the library.
+
+  Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running a program using the Library is not restricted, and output from
+such a program is covered only if its contents constitute a work based
+on the Library (independent of the use of the Library in a tool for
+writing it).  Whether that is true depends on what the Library does
+and what the program that uses the Library does.
+
+  1. You may copy and distribute verbatim copies of the Library's
+complete source code as you receive it, in any medium, provided that
+you conspicuously and appropriately publish on each copy an
+appropriate copyright notice and disclaimer of warranty; keep intact
+all the notices that refer to this License and to the absence of any
+warranty; and distribute a copy of this License along with the
+Library.
+
+  You may charge a fee for the physical act of transferring a copy,
+and you may at your option offer warranty protection in exchange for a
+fee.
+
+  2. You may modify your copy or copies of the Library or any portion
+of it, thus forming a work based on the Library, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) The modified work must itself be a software library.
+
+    b) You must cause the files modified to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    c) You must cause the whole of the work to be licensed at no
+    charge to all third parties under the terms of this License.
+
+    d) If a facility in the modified Library refers to a function or a
+    table of data to be supplied by an application program that uses
+    the facility, other than as an argument passed when the facility
+    is invoked, then you must make a good faith effort to ensure that,
+    in the event an application does not supply such function or
+    table, the facility still operates, and performs whatever part of
+    its purpose remains meaningful.
+
+    (For example, a function in a library to compute square roots has
+    a purpose that is entirely well-defined independent of the
+    application.  Therefore, Subsection 2d requires that any
+    application-supplied function or table used by this function must
+    be optional: if the application does not supply it, the square
+    root function must still compute square roots.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Library,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Library, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote
+it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Library.
+
+In addition, mere aggregation of another work not based on the Library
+with the Library (or with a work based on the Library) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may opt to apply the terms of the ordinary GNU General Public
+License instead of this License to a given copy of the Library.  To do
+this, you must alter all the notices that refer to this License, so
+that they refer to the ordinary GNU General Public License, version 2,
+instead of to this License.  (If a newer version than version 2 of the
+ordinary GNU General Public License has appeared, then you can specify
+that version instead if you wish.)  Do not make any other change in
+these notices.
+
+  Once this change is made in a given copy, it is irreversible for
+that copy, so the ordinary GNU General Public License applies to all
+subsequent copies and derivative works made from that copy.
+
+  This option is useful when you wish to copy part of the code of
+the Library into a program that is not a library.
+
+  4. You may copy and distribute the Library (or a portion or
+derivative of it, under Section 2) in object code or executable form
+under the terms of Sections 1 and 2 above provided that you accompany
+it with the complete corresponding machine-readable source code, which
+must be distributed under the terms of Sections 1 and 2 above on a
+medium customarily used for software interchange.
+
+  If distribution of object code is made by offering access to copy
+from a designated place, then offering equivalent access to copy the
+source code from the same place satisfies the requirement to
+distribute the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  5. A program that contains no derivative of any portion of the
+Library, but is designed to work with the Library by being compiled or
+linked with it, is called a "work that uses the Library".  Such a
+work, in isolation, is not a derivative work of the Library, and
+therefore falls outside the scope of this License.
+
+  However, linking a "work that uses the Library" with the Library
+creates an executable that is a derivative of the Library (because it
+contains portions of the Library), rather than a "work that uses the
+library".  The executable is therefore covered by this License.
+Section 6 states terms for distribution of such executables.
+
+  When a "work that uses the Library" uses material from a header file
+that is part of the Library, the object code for the work may be a
+derivative work of the Library even though the source code is not.
+Whether this is true is especially significant if the work can be
+linked without the Library, or if the work is itself a library.  The
+threshold for this to be true is not precisely defined by law.
+
+  If such an object file uses only numerical parameters, data
+structure layouts and accessors, and small macros and small inline
+functions (ten lines or less in length), then the use of the object
+file is unrestricted, regardless of whether it is legally a derivative
+work.  (Executables containing this object code plus portions of the
+Library will still fall under Section 6.)
+
+  Otherwise, if the work is a derivative of the Library, you may
+distribute the object code for the work under the terms of Section 6.
+Any executables containing that work also fall under Section 6,
+whether or not they are linked directly with the Library itself.
+
+  6. As an exception to the Sections above, you may also combine or
+link a "work that uses the Library" with the Library to produce a
+work containing portions of the Library, and distribute that work
+under terms of your choice, provided that the terms permit
+modification of the work for the customer's own use and reverse
+engineering for debugging such modifications.
+
+  You must give prominent notice with each copy of the work that the
+Library is used in it and that the Library and its use are covered by
+this License.  You must supply a copy of this License.  If the work
+during execution displays copyright notices, you must include the
+copyright notice for the Library among them, as well as a reference
+directing the user to the copy of this License.  Also, you must do one
+of these things:
+
+    a) Accompany the work with the complete corresponding
+    machine-readable source code for the Library including whatever
+    changes were used in the work (which must be distributed under
+    Sections 1 and 2 above); and, if the work is an executable linked
+    with the Library, with the complete machine-readable "work that
+    uses the Library", as object code and/or source code, so that the
+    user can modify the Library and then relink to produce a modified
+    executable containing the modified Library.  (It is understood
+    that the user who changes the contents of definitions files in the
+    Library will not necessarily be able to recompile the application
+    to use the modified definitions.)
+
+    b) Use a suitable shared library mechanism for linking with the
+    Library.  A suitable mechanism is one that (1) uses at run time a
+    copy of the library already present on the user's computer system,
+    rather than copying library functions into the executable, and (2)
+    will operate properly with a modified version of the library, if
+    the user installs one, as long as the modified version is
+    interface-compatible with the version that the work was made with.
+
+    c) Accompany the work with a written offer, valid for at
+    least three years, to give the same user the materials
+    specified in Subsection 6a, above, for a charge no more
+    than the cost of performing this distribution.
+
+    d) If distribution of the work is made by offering access to copy
+    from a designated place, offer equivalent access to copy the above
+    specified materials from the same place.
+
+    e) Verify that the user has already received a copy of these
+    materials or that you have already sent this user a copy.
+
+  For an executable, the required form of the "work that uses the
+Library" must include any data and utility programs needed for
+reproducing the executable from it.  However, as a special exception,
+the materials to be distributed need not include anything that is
+normally distributed (in either source or binary form) with the major
+components (compiler, kernel, and so on) of the operating system on
+which the executable runs, unless that component itself accompanies
+the executable.
+
+  It may happen that this requirement contradicts the license
+restrictions of other proprietary libraries that do not normally
+accompany the operating system.  Such a contradiction means you cannot
+use both them and the Library together in an executable that you
+distribute.
+
+  7. You may place library facilities that are a work based on the
+Library side-by-side in a single library together with other library
+facilities not covered by this License, and distribute such a combined
+library, provided that the separate distribution of the work based on
+the Library and of the other library facilities is otherwise
+permitted, and provided that you do these two things:
+
+    a) Accompany the combined library with a copy of the same work
+    based on the Library, uncombined with any other library
+    facilities.  This must be distributed under the terms of the
+    Sections above.
+
+    b) Give prominent notice with the combined library of the fact
+    that part of it is a work based on the Library, and explaining
+    where to find the accompanying uncombined form of the same work.
+
+  8. You may not copy, modify, sublicense, link with, or distribute
+the Library except as expressly provided under this License.  Any
+attempt otherwise to copy, modify, sublicense, link with, or
+distribute the Library is void, and will automatically terminate your
+rights under this License.  However, parties who have received copies,
+or rights, from you under this License will not have their licenses
+terminated so long as such parties remain in full compliance.
+
+  9. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Library or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Library (or any work based on the
+Library), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Library or works based on it.
+
+  10. Each time you redistribute the Library (or any work based on the
+Library), the recipient automatically receives a license from the
+original licensor to copy, distribute, link with or modify the Library
+subject to these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties with
+this License.
+
+  11. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Library at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Library by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Library.
+
+If any portion of this section is held invalid or unenforceable under any
+particular circumstance, the balance of the section is intended to apply,
+and the section as a whole is intended to apply in other circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  12. If the distribution and/or use of the Library is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Library under this License may add
+an explicit geographical distribution limitation excluding those countries,
+so that distribution is permitted only in or among countries not thus
+excluded.  In such case, this License incorporates the limitation as if
+written in the body of this License.
+
+  13. The Free Software Foundation may publish revised and/or new
+versions of the Lesser General Public License from time to time.
+Such new versions will be similar in spirit to the present version,
+but may differ in detail to address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Library
+specifies a version number of this License which applies to it and
+"any later version", you have the option of following the terms and
+conditions either of that version or of any later version published by
+the Free Software Foundation.  If the Library does not specify a
+license version number, you may choose any version ever published by
+the Free Software Foundation.
+
+  14. If you wish to incorporate parts of the Library into other free
+programs whose distribution conditions are incompatible with these,
+write to the author to ask for permission.  For software which is
+copyrighted by the Free Software Foundation, write to the Free
+Software Foundation; we sometimes make exceptions for this.  Our
+decision will be guided by the two goals of preserving the free status
+of all derivatives of our free software and of promoting the sharing
+and reuse of software generally.
+
+			    NO WARRANTY
+
+  15. BECAUSE THE LIBRARY IS LICENSED FREE OF CHARGE, THERE IS NO
+WARRANTY FOR THE LIBRARY, TO THE EXTENT PERMITTED BY APPLICABLE LAW.
+EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR
+OTHER PARTIES PROVIDE THE LIBRARY "AS IS" WITHOUT WARRANTY OF ANY
+KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE.  THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE
+LIBRARY IS WITH YOU.  SHOULD THE LIBRARY PROVE DEFECTIVE, YOU ASSUME
+THE COST OF ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+  16. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN
+WRITING WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY
+AND/OR REDISTRIBUTE THE LIBRARY AS PERMITTED ABOVE, BE LIABLE TO YOU
+FOR DAMAGES, INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR
+CONSEQUENTIAL DAMAGES ARISING OUT OF THE USE OR INABILITY TO USE THE
+LIBRARY (INCLUDING BUT NOT LIMITED TO LOSS OF DATA OR DATA BEING
+RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD PARTIES OR A
+FAILURE OF THE LIBRARY TO OPERATE WITH ANY OTHER SOFTWARE), EVEN IF
+SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF SUCH
+DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+           How to Apply These Terms to Your New Libraries
+
+  If you develop a new library, and you want it to be of the greatest
+possible use to the public, we recommend making it free software that
+everyone can redistribute and change.  You can do so by permitting
+redistribution under these terms (or, alternatively, under the terms of the
+ordinary General Public License).
+
+  To apply these terms, attach the following notices to the library.  It is
+safest to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least the
+"copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the library's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This library is free software; you can redistribute it and/or
+    modify it under the terms of the GNU Lesser General Public
+    License as published by the Free Software Foundation; either
+    version 2.1 of the License, or (at your option) any later version.
+
+    This library is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+    Lesser General Public License for more details.
+
+    You should have received a copy of the GNU Lesser General Public
+    License along with this library; if not, write to the Free Software
+    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+Also add information on how to contact you by electronic and paper mail.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the library, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the
+  library `Frob' (a library for tweaking knobs) written by James Random Hacker.
+
+  <signature of Ty Coon>, 1 April 1990
+  Ty Coon, President of Vice
+
+That's all there is to it!
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..5599870
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,183 @@
+SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+
+INSTALLATION GUIDE CONTENTS
+
+* Standard build
+* Build customisation
+* Cross-compilation
+* Integration with other build systems
+* Run-time configuration
+
+
+
+STANDARD BUILD
+
+1. Prerequisites:
+
+    Before you can build this library, you need to have available on your
+    system:
+
+    * A C-compiler with 64-bit integer support and, optionally, OpenMP, SIMD.
+
+    * A 'make' utility (most compiler installations already have one of these).
+
+    * CMake v3.0 or newer: https://cmake.org/download/
+
+
+2. Build:
+
+    At a command prompt, change directory (`cd') to the one containing this
+    file, then enter:
+
+        go                          (on MS-Windows with nmake)
+    or
+        ./go                        (on Unix-like systems)
+
+    This should build the library and run a few sanity tests.
+
+
+3. Installation:
+
+    Note that this step may need to be performed by a system
+    administrator.  Enter:
+
+        nmake install               (on MS-Windows)
+    or
+        cd Release; make install    (on Unix-like)
+
+
+4. Preparation for use:
+
+    To use the library you may need to set up appropriate paths to the
+    library and its header file in your development environment.
+
+
+5. Installation test
+
+    To test the installation, build and run some of the example programmes
+    (see examples/README).
+
+
+
+BUILD CUSTOMISATION
+
+If it is necessary to customise the build, then steps 2 and 3 above should be
+substituted as follows: change directory to the one containing this file, then
+enter commands along the lines:
+
+    mkdir build
+    cd build
+    cmake -Wno-dev -DCMAKE_BUILD_TYPE=Release [OPTIONS] ..
+    make
+    make test
+    sudo make install
+
+N.B. The CMAKE_BUILD_TYPE to use for library deployment is Release.
+
+To list help on the available options, enter:
+
+    cmake -LH ..
+
+Options, if given, should be preceded with '-D', e.g.
+
+    -DBUILD_SHARED_LIBS:BOOL=OFF
+
+
+
+Resampling engines
+
+As available on a given system, options for including up to five resampling
+‘engines’ are available (per above) as follows:
+
+    WITH_CR32: for constant-rate HQ resampling,
+    WITH_CR32S: SIMD variant of previous,
+    WITH_CR64: for constant-rate VHQ resampling,
+    WITH_CR64S: SIMD variant of previous,
+    WITH_VR32: for variable-rate HQ resampling.
+
+By default, these options are all set to ON.
+
+When both SIMD and non-SIMD engine variants are included, run-time selection
+is automatic (based on CPU capability) for x86 CPUs, and can be automatic for
+ARM CPUs if the 3rd-party library `libavutil' is available at libsoxr
+build-time.  Which engine has been selected for a specific configuration and
+invocation of the library can be checked using example #3, which reports it.
+See also Run-time Configuration, below.
+
+
+
+CROSS-COMPILATION
+
+E.g. targeting a Linux ARM system:
+
+    mkdir build
+    cd build
+    cmake -DCMAKE_SYSTEM_NAME=Linux \
+          -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+          ..
+or, also building the examples (one of which uses C++):
+
+    cmake -DCMAKE_SYSTEM_NAME=Linux \
+          -DCMAKE_C_COMPILER=arm-linux-gnueabi-gcc \
+          -DCMAKE_CXX_COMPILER=arm-linux-gnueabi-g++ \
+          -DBUILD_EXAMPLES=1 \
+          ..
+
+E.g. with Mingw (Linux host), using a tool-chain file:
+
+    mkdir build
+    cd build
+    cmake -DCMAKE_TOOLCHAIN_FILE=~/Toolchain-x86_64-mingw-w64-mingw32.cmake \
+          -DCMAKE_INSTALL_PREFIX=install \
+          ..
+    make
+
+where ~/Toolchain-x86_64-mingw-w64-mingw32.cmake might contain:
+
+    SET(CMAKE_SYSTEM_NAME Windows)
+    SET(CMAKE_C_COMPILER /usr/bin/x86_64-w64-mingw32-gcc)
+    SET(CMAKE_CXX_COMPILER /usr/bin/x86_64-w64-mingw32-g++)
+    SET(CMAKE_RC_COMPILER /usr/bin/x86_64-w64-mingw32-windres)
+    SET(CMAKE_Fortran_COMPILER /usr/bin/x86_64-w64-mingw32-gfortran)
+    SET(CMAKE_AR:FILEPATH /usr/bin/x86_64-w64-mingw32-ar)
+    SET(CMAKE_RANLIB:FILEPATH /usr/bin/x86_64-w64-mingw32-ranlib)
+    SET(CMAKE_FIND_ROOT_PATH /usr/x86_64-w64-mingw32)
+    SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
+    SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
+    SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)
+    SET(QT_BINARY_DIR /usr/x86_64-w64-mingw32/bin /usr/bin)
+    SET(Boost_COMPILER -gcc47)
+
+
+
+INTEGRATION WITH OTHER BUILD SYSTEMS
+
+Autotools-based systems might find it useful to create a file called
+`configure' in the directory containing this file, consisting of the line:
+    cmake -DBUILD_SHARED_LIBS=OFF .
+(or with other build options as required).
+
+For MS Visual Studio, see msvc/README.
+
+
+
+RUN-TIME CONFIGURATION
+
+The libsoxr API structure ‘soxr_runtime_spec_t’ allows application developers
+to optimise some aspects of libsoxr’s operation for a particular application.
+Optimal performance, however, might depend on an individual end-user’s
+run-time system and the end-user’s preferences.  Hence environment variables
+are available to set (override) run-time parameters as follows:
+
+    Env. variable        Equivalent soxr_runtime_spec_t item (see soxr.h)
+    ------------------   -----------------------------------
+    SOXR_COEFS_SIZE      coef_size_kbytes
+    SOXR_COEF_INTERP     SOXR_COEF_INTERP_xxx
+    SOXR_LARGE_DFT_SIZE  log2_large_dft_size
+    SOXR_MIN_DFT_SIZE    log2_min_dft_size
+    SOXR_NUM_THREADS     num_threads
+
+Additionally, the SOXR_USE_SIMD32 and SOXR_USE_SIMD64 boolean environment
+variables can be used to override automatic selection (or to provide manual
+selection where automatic selection is not available) between SIMD and
+non-SIMD engine variants.
diff --git a/LICENCE b/LICENCE
new file mode 100644
index 0000000..43e5a71
--- /dev/null
+++ b/LICENCE
@@ -0,0 +1,23 @@
+SoX Resampler Library       Copyright (c) 2007-18 robs@users.sourceforge.net
+
+This library is free software; you can redistribute it and/or modify it
+under the terms of the GNU Lesser General Public License as published by
+the Free Software Foundation; either version 2.1 of the License, or (at
+your option) any later version.
+
+This library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public License
+along with this library; if not, see <https://www.gnu.org/licenses/>.
+
+
+Notes
+
+1. Re software in the `examples' directory: works that are not resampling
+examples but are based on the given examples -- for example, applications using
+the library -- shall not be considered to be derivative works of the examples.
+
+2. If building with pffft.c, see the licence embedded in that file.
diff --git a/NEWS b/NEWS
new file mode 100644
index 0000000..9e7c298
--- /dev/null
+++ b/NEWS
@@ -0,0 +1,46 @@
+Version 0.1.3 (2018-02-24)
+  * SIMD enhancements: SSE, AVX, Neon.
+  * Improve support for clang, ARM, and cross-compilation.
+  * Provide env. var. override of runtime parameters.
+  * Build fix re cmake variables AVCODEC_INCLUDE_DIRS & AVUTIL_INCLUDE_DIRS.
+  * Build options WITH_SINGLE_PRECISION, WITH_DOUBLE_PRECISION & WITH_SIMD have
+    been removed; replacement options are detailed in INSTALL, `Resampling
+    engines'.
+
+Version 0.1.2 (2015-09-05)
+  * Fix conversion failure when I/O types differ but I/O rates don't.
+  * Fix #defines for interpolation order selection.
+  * Fix ineffectual SOXR_MINIMUM_PHASE and SOXR_INTERMEDIATE_PHASE in
+    soxr_quality_spec recipe.
+  * Fix soxr_delay() returning a negative number after end-of-input has been
+    indicated.
+  * Fix crash when using soxr_process() after calling soxr_clear().
+  * Be more POSIX compliant w.r.t. errno in the examples; fixes erroneous
+    reporting of errors on FreeBSD.
+  * Quality improvement for variable-rate.
+  * Various fixes/improvements to build/tests/documentation.
+
+Version 0.1.1 (2013-03-03)
+  * Minor fixes/improvements to build/tests.
+  * Fix crash (e.g. with k3b) when null error pointer passed to src_create (lsr
+    bindings only).
+  * Fix broken resampling in many cases with SIMD and anti_aliasing_pc < 100.
+  * For clarity, renamed and slightly changed usage of three parameters in
+    soxr_quality_spec_t (ABI compatible, API incompatible).  An application not
+    setting these parameters directly need make no change; otherwise, changes
+    should be made per the following example (as shown, compatibility with both
+    old/new APIs is maintained).  See also the comments on these parameters in
+    soxr.h.  N.B. ABI compatibility with the 0.1.0 API may be removed in a
+    future release.
+      #if !defined SOXR_VERSION /* Deprecated, 0.1.0 API */
+        q_spec.phase = minimum_phase? 0 : 50;
+        q_spec.bw_pc = cutoff * 100;
+        q_spec.anti_aliasing_pc = anti_aliasing * 100;
+      #else /* 0.1.1 API */                            Explanation:
+        q_spec.phase_response = minimum_phase? 0 : 50;  Renamed.
+        q_spec.passband_end = cutoff;                   Renamed, no longer %.
+        q_spec.stopband_begin = 2 - anti_aliasing;      Renamed, no longer %, no
+      #endif                                            longer mirrored in Fs.
+
+Version 0.1.0 (2013-01-19)
+  * First public release.
diff --git a/README b/README
new file mode 100644
index 0000000..7f9a7af
--- /dev/null
+++ b/README
@@ -0,0 +1,53 @@
+SoX Resampler Library       Copyright (c) 2007-18 robs@users.sourceforge.net
+
+The SoX Resampler library `libsoxr' performs one-dimensional sample-rate
+conversion -- it may be used, for example, to resample PCM-encoded audio.
+For higher-dimensional resampling, such as for visual-image processing, you
+should look elsewhere.
+
+It aims to give fast¹ and very high quality² results for any constant
+(rational or irrational) resampling ratio.  Phase-response, preserved
+bandwidth, aliasing, and rejection level parameters are all configurable;
+alternatively, simple `preset' configurations may be selected.  A
+variable-rate resampling mode of operation is also included.
+
+The resampler is currently available either as part of `libsox' (the audio
+file-format and effect library), or stand-alone as `libsoxr' (this package).
+The interfaces to libsox and libsoxr are slightly different, with that of
+libsoxr designed specifically for resampling.  An application requiring
+support for other effects, or for reading-from or writing-to audio files or
+devices, should use libsox (or other libraries such as libsndfile or
+libavformat).
+
+Libsoxr provides a simple API that allows interfacing using the most
+commonly-used sample formats and buffering schemes: sample-formats may be
+either floating-point or integer, and multiple channels either interleaved
+or split in separate buffers.  The API is documented in the header file
+`soxr.h', together with sample code found in the 'examples' directory.
+
+For compatibility with the popular `libsamplerate' library, the header file
+`soxr-lsr.h' is provided and may be used as an alternative API.³  Note
+however, that libsoxr does not provide a full emulation of libsamplerate
+and that using this approach, only a sub-set of libsoxr's features are
+available.
+
+The design was inspired by Laurent De Soras' paper `The Quest For The
+Perfect Resampler', http://ldesoras.free.fr/doc/articles/resampler-en.pdf;
+in essence, it combines Julius O. Smith's `Bandlimited Interpolation'
+technique (https://ccrma.stanford.edu/~jos/resample/resample.pdf) with FFT-
+based over-sampling.
+
+Note that for real-time resampling, libsoxr may have a higher latency
+than non-FFT based resamplers.  For example, when using the `High Quality'
+configuration to resample between 44100Hz and 48000Hz, the latency is
+around 1000 output samples, i.e. roughly 20ms (though passband and FFT-
+size configuration parameters may be used to reduce this figure).
+
+For build and installation instructions, see the file `INSTALL'; for
+copyright and licensing information, see the file `LICENCE'.
+
+For support and new versions, see https://soxr.sourceforge.net
+________
+¹ For example, multi-channel resampling can utilise multiple CPU-cores.
+² Bit-perfect within practical occupied-bandwidth limits.
+³ For details of that API, see http://www.mega-nerd.com/SRC/api.html.
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..2d1bc19
--- /dev/null
+++ b/TODO
@@ -0,0 +1,3 @@
+* vr32s
+* vr32 with 1-delay-clear
+* fir_to_phase with RDFT32
diff --git a/cmake/Modules/FindCFlags.cmake b/cmake/Modules/FindCFlags.cmake
new file mode 100644
index 0000000..f118727
--- /dev/null
+++ b/cmake/Modules/FindCFlags.cmake
@@ -0,0 +1,35 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Function to find C compiler feature flags
+
+include (CheckCSourceCompiles)
+include (FindPackageHandleStandardArgs)
+
+function (FindCFlags PKG_NAME PKG_DESC TRIAL_C_FLAGS TEST_C_SOURCE)
+# Keep the first entry of TRIAL_C_FLAGS with which TEST_C_SOURCE compiles:
+foreach (TRIAL_C_FLAG ${TRIAL_C_FLAGS})
+  message (STATUS "Trying ${PKG_NAME} C flags: ${TRIAL_C_FLAG}")
+  unset (DETECT_${PKG_NAME}_C_FLAGS CACHE) #displayed by check_c_source_compiles
+  # Compile the test with CMAKE_REQUIRED_FLAGS temporarily set to the trial flag:
+  set (TMP "${CMAKE_REQUIRED_FLAGS}")
+  set (CMAKE_REQUIRED_FLAGS "${TRIAL_C_FLAG}")
+  check_c_source_compiles ("${TEST_C_SOURCE}" DETECT_${PKG_NAME}_C_FLAGS)
+  set (CMAKE_REQUIRED_FLAGS "${TMP}")
+  # First success wins; any remaining trial flags are not tried:
+  if (DETECT_${PKG_NAME}_C_FLAGS)
+    set (DETECTED_C_FLAGS "${TRIAL_C_FLAG}")
+    break ()
+  endif ()
+endforeach ()
+
+# N.B. Will not overwrite existing cache variable:
+set (${PKG_NAME}_C_FLAGS "${DETECTED_C_FLAGS}"
+  CACHE STRING "C compiler flags for ${PKG_DESC}")
+# Report the result and pass ${PKG_NAME}_FOUND up to the caller's scope:
+find_package_handle_standard_args (
+  ${PKG_NAME} DEFAULT_MSG ${PKG_NAME}_C_FLAGS ${PKG_NAME}_C_FLAGS)
+mark_as_advanced (${PKG_NAME}_C_FLAGS)
+set (${PKG_NAME}_FOUND ${${PKG_NAME}_FOUND} PARENT_SCOPE)
+
+endfunction ()
diff --git a/cmake/Modules/FindLibAVCodec.cmake b/cmake/Modules/FindLibAVCodec.cmake
new file mode 100644
index 0000000..f1bbf89
--- /dev/null
+++ b/cmake/Modules/FindLibAVCodec.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library       Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Find AVCODEC
+# Find the installation of this package: include-dirs and libraries.
+#
+#  AVCODEC_INCLUDE_DIRS - where to find headers for this package.
+#  AVCODEC_LIBRARIES    - libraries to link to when using this package.
+#  AVCODEC_FOUND        - true iff this package can be found.
+
+if (AVCODEC_INCLUDE_DIRS)
+  set (AVCODEC_FIND_QUIETLY TRUE) # Include dir already set: suppress find messages
+endif ()
+# Search for the directory under which libavcodec/avcodec.h exists:
+find_path (AVCODEC_INCLUDE_DIRS libavcodec/avcodec.h)
+# Search for the avcodec library itself:
+find_library (AVCODEC_LIBRARIES NAMES avcodec)
+# Set AVCODEC_FOUND from the two results above and print the standard message:
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+  AVCODEC DEFAULT_MSG AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS)
+# Mark the result variables as advanced cache entries:
+mark_as_advanced (AVCODEC_LIBRARIES AVCODEC_INCLUDE_DIRS)
diff --git a/cmake/Modules/FindLibAVUtil.cmake b/cmake/Modules/FindLibAVUtil.cmake
new file mode 100644
index 0000000..464e6cf
--- /dev/null
+++ b/cmake/Modules/FindLibAVUtil.cmake
@@ -0,0 +1,23 @@
+# SoX Resampler Library       Copyright (c) 2007-18 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Find AVUTIL
+# Find the installation of this package: includes and libraries.
+#
+#  AVUTIL_INCLUDE_DIRS - where to find headers for this package.
+#  AVUTIL_LIBRARIES    - libraries to link to when using this package.
+#  AVUTIL_FOUND        - true iff this package can be found.
+
+if (AVUTIL_INCLUDE_DIRS)
+  set (AVUTIL_FIND_QUIETLY TRUE) # Include dir already set: suppress find messages
+endif ()
+# Search for the directory under which libavutil/cpu.h exists:
+find_path (AVUTIL_INCLUDE_DIRS libavutil/cpu.h)
+# Search for the avutil library itself:
+find_library (AVUTIL_LIBRARIES NAMES avutil)
+# Set AVUTIL_FOUND from the two results above and print the standard message:
+include (FindPackageHandleStandardArgs)
+find_package_handle_standard_args (
+  AVUTIL DEFAULT_MSG AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS)
+# Mark the result variables as advanced cache entries:
+mark_as_advanced (AVUTIL_LIBRARIES AVUTIL_INCLUDE_DIRS)
diff --git a/cmake/Modules/FindSIMD32.cmake b/cmake/Modules/FindSIMD32.cmake
new file mode 100644
index 0000000..9e42373
--- /dev/null
+++ b/cmake/Modules/FindSIMD32.cmake
@@ -0,0 +1,54 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Finds SIMD32 support
+#
+# The following variables are set:
+#   SIMD32_C_FLAGS - flags to add to the C compiler for this package.
+#   SIMD32_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD32_C_FLAGS)  # Flags already decided, so there is nothing to try.
+  set (TRIAL_C_FLAGS)
+elseif (CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")  # ARM: candidates use NEON.
+  set (TRIAL_C_FLAGS
+    # Gcc
+    "-mfpu=neon-vfpv4 -mcpu=cortex-a7"
+    "-mfpu=neon       -mfloat-abi=hard"
+    "-mfpu=neon       -mfloat-abi=softfp"
+    "-mfpu=neon       -mfloat-abi=soft"
+  )
+  set (TEST_C_SOURCE "
+    #include <arm_neon.h>
+    int main(int c, char * * v) {
+      float32x4_t a = vdupq_n_f32((float)c), b = vdupq_n_f32((float)!!v);
+      return !vgetq_lane_u32(vceqq_f32(a,b),0);
+    }
+  ")
+else ()  # Any other processor: candidates use SSE.
+  if (WIN32) # Safety for when mixed lib/app compilers (but performance hit)
+    set (GCC_WIN32_SIMD32_OPTS "-mincoming-stack-boundary=2")
+  endif ()
+
+  set (TRIAL_C_FLAGS
+    # x64
+    " "
+    # MSVC x86
+    "/arch:SSE /fp:fast -D__SSE__"
+    # Gcc x86
+    "-msse -mfpmath=sse ${GCC_WIN32_SIMD32_OPTS}"
+    # Gcc x86 (old versions)
+    "-msse -mfpmath=sse"
+  )
+  set (TEST_C_SOURCE "
+    #include <xmmintrin.h>
+    int main(int c, char * * v) {
+      __m128 a = _mm_set_ss((float)c), b = _mm_set_ss((float)!!v);
+      return _mm_comineq_ss(a,b);
+    }
+  ")
+endif ()
+# Hand the candidate flag-sets and the test programme to FindCFlags:
+include (FindCFlags)
+
+FindCFlags ("SIMD32" "FLOAT-32 (single-precision) SIMD vectorization"
+  "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/cmake/Modules/FindSIMD64.cmake b/cmake/Modules/FindSIMD64.cmake
new file mode 100644
index 0000000..d412644
--- /dev/null
+++ b/cmake/Modules/FindSIMD64.cmake
@@ -0,0 +1,29 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# - Finds SIMD64 support
+#
+# The following variables are set:
+#   SIMD64_C_FLAGS - flags to add to the C compiler for this package.
+#   SIMD64_FOUND - true if support for this package is found.
+
+if (DEFINED SIMD64_C_FLAGS OR CMAKE_SYSTEM_PROCESSOR MATCHES "^arm")
+  set (TRIAL_C_FLAGS)  # Flags already decided, or ARM: there is nothing to try.
+else ()  # Otherwise the candidates enable AVX; the test needs __AVX__ defined.
+  set (TRIAL_C_FLAGS
+    "-mavx" # Gcc
+    "/arch:AVX" # MSVC
+  )
+  set (TEST_C_SOURCE "
+    #ifndef __AVX__
+      #error
+    #endif
+    #include <immintrin.h>
+    int main() {return 0;}
+    ")
+endif ()
+# Hand the candidate flag-sets and the test programme to FindCFlags:
+include (FindCFlags)
+
+FindCFlags ("SIMD64" "FLOAT-64 (double-precision) SIMD vectorization"
+  "${TRIAL_C_FLAGS}" "${TEST_C_SOURCE}")
diff --git a/cmake/Modules/SetSystemProcessor.cmake b/cmake/Modules/SetSystemProcessor.cmake
new file mode 100644
index 0000000..8e2c292
--- /dev/null
+++ b/cmake/Modules/SetSystemProcessor.cmake
@@ -0,0 +1,37 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Sets CMAKE_SYSTEM_PROCESSOR for cross-compiling.
+
+macro (set_system_processor)  # Has an effect only when cross-compiling.
+  if (CMAKE_CROSSCOMPILING)
+    if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "" OR "${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "unknown")
+      unset(CMAKE_SYSTEM_PROCESSOR)  # Treat empty or "unknown" as not set.
+    endif ()
+    if (NOT DEFINED CMAKE_SYSTEM_PROCESSOR)  # Detect it via compiler macros:
+      include (CheckCSourceCompiles)
+      set (CPU_LINES  # Each entry: a pre-processor test, then /*;<name>;*/
+        "#if defined __x86_64__ || defined _M_X64  /*\;x86_64\;*/"
+        "#if defined __i386__   || defined _M_IX86 /*\;x86_32\;*/"
+        "#if defined __arm__    || defined _M_ARM  /*\;arm\;*/"
+        )
+      foreach (CPU_LINE ${CPU_LINES})  # main() is present only if the #if holds:
+        string (CONCAT CPU_SOURCE "${CPU_LINE}" "
+        int main() {return 0;}
+        #endif
+        ")
+        unset (SYSTEM_PROCESSOR_DETECTED CACHE)  # Discard any previous result.
+        check_c_source_compiles ("${CPU_SOURCE}" SYSTEM_PROCESSOR_DETECTED)
+        if (SYSTEM_PROCESSOR_DETECTED)
+          list (GET CPU_LINE 1 CMAKE_SYSTEM_PROCESSOR)  # Item 1 is the <name>.
+          message (STATUS "CMAKE_SYSTEM_PROCESSOR is ${CMAKE_SYSTEM_PROCESSOR}")
+          break ()
+        endif ()
+      endforeach ()
+    endif ()
+
+    # N.B. Will not overwrite existing cache variable:
+    set (CMAKE_SYSTEM_PROCESSOR "${CMAKE_SYSTEM_PROCESSOR}"
+      CACHE STRING "Target system processor")
+  endif ()
+endmacro ()
diff --git a/deinstall.cmake.in b/deinstall.cmake.in
new file mode 100644
index 0000000..307be50
--- /dev/null
+++ b/deinstall.cmake.in
@@ -0,0 +1,25 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+if (NOT EXISTS "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt")
+  message (FATAL_ERROR "Cannot find install manifest")
+endif ()
+# Read the manifest (one installed path per line) and convert it to a list:
+file (READ "@CMAKE_CURRENT_BINARY_DIR@/install_manifest.txt" files)
+string (REGEX REPLACE "\n" ";" files "${files}")
+foreach (file ${files})
+  set (dest "$ENV{DESTDIR}${file}")  # Prefix with DESTDIR, if that is set.
+  message (STATUS "Deinstalling \"${dest}\"")
+  if (EXISTS "${dest}" OR IS_SYMLINK "${dest}")
+    execute_process (
+      COMMAND "@CMAKE_COMMAND@" -E remove "${dest}"
+      OUTPUT_VARIABLE rm_out
+      RESULT_VARIABLE rm_retval
+    )
+    if (NOT "${rm_retval}" STREQUAL 0)  # Quoted: result may be an error string.
+      message (FATAL_ERROR "Problem when removing \"${dest}\"")
+    endif ()
+  else ()
+    message (STATUS "File \"${dest}\" does not exist.")
+  endif ()
+endforeach ()
diff --git a/examples/1-single-block.c b/examples/1-single-block.c
new file mode 100644
index 0000000..3b919b4
--- /dev/null
+++ b/examples/1-single-block.c
@@ -0,0 +1,50 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 1: `One-shot' resample a single block of data in memory.
+ *
+ * N.B. See example 2 for how to resample a stream (of blocks).
+ *
+ * Optional arguments are: INPUT-RATE OUTPUT-RATE
+ *
+ * With the default arguments, the output should produce lines similar to the
+ * following:
+ *
+ *  0.00  0.71  1.00  0.71 -0.00 -0.71 -1.00 -0.71
+ *
+ * Gibbs effect may be seen at the ends of the resampled signal; this is because
+ * unlike a `real-world' signal, the synthetic input signal is not band-limited.
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+const float in[] = {  /* Input: 12 cycles of a sine wave with freq. = irate/4 */
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1,
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1};
+
+int main(int argc, char const * arg[])
+{
+  double irate = argc > 1? atof(arg[1]) : 1;       /* By default, interpolate */
+  double orate = argc > 2? atof(arg[2]) : 2;             /* by a factor of 2. */
+
+  size_t capacity = (size_t)(AL(in) * orate / irate + .5); /* Est. output len. */
+  float * resampled = malloc(sizeof(*resampled) * capacity); /* Output buffer. */
+  size_t produced;
+
+  soxr_error_t status = soxr_oneshot(irate, orate, 1, /* Rates; one channel.  */
+      in, AL(in), NULL,                               /* Input. */
+      resampled, capacity, &produced,                 /* Output. */
+      NULL, NULL, NULL);                              /* Default config. */
+
+  unsigned k;                   /* Print the resampled data, 8 values per line, */
+  for (k = 0; k < produced; ++k)
+    printf("%5.2f%c", resampled[k], " \n"[!((k+1)&7) || k+1 == produced]);
+  printf("%-26s %s\n", arg[0], soxr_strerror(status)); /* then the result.    */
+
+  if (argc > 3)                           /* A 3rd argument: show the versions. */
+    printf("runtime=%s API="SOXR_THIS_VERSION_STR"\n", soxr_version());
+
+  free(resampled);                                                /* Tidy up. */
+  return !!status;
+}
diff --git a/examples/1a-lsr.c b/examples/1a-lsr.c
new file mode 100644
index 0000000..6b50a8f
--- /dev/null
+++ b/examples/1a-lsr.c
@@ -0,0 +1,40 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 1a: Variant of example 1 using libsamplerate-like bindings. */
+
+#include <soxr-lsr.h>
+#include "examples-common.h"
+
+float in[] = {  /* Input: 12 cycles of a sine wave with freq. = irate/4 */
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1,
+  0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1, 0,1,0,-1};
+
+int main(int argc, char const * arg[])
+{
+  double irate = argc > 1? atof(arg[1]) : 1;       /* By default, interpolate */
+  double orate = argc > 2? atof(arg[2]) : 2;             /* by a factor of 2. */
+
+  size_t capacity = (size_t)(AL(in) * orate / irate + .5); /* Est. output len. */
+  float * resampled = (float *)malloc(sizeof(*resampled) * capacity);
+
+  int status, k;
+  SRC_DATA job;
+
+  job.data_in = in;                         /* Describe the conversion job... */
+  job.data_out = resampled;
+  job.input_frames = AL(in);
+  job.output_frames = (int)capacity;
+  job.src_ratio = orate / irate;
+  status = src_simple(&job, SRC_SINC_FASTEST, 1);   /* ...and run it in one go. */
+
+  for (k = 0; k < job.output_frames_gen; ++k)    /* Print 8 values per line, */
+    printf("%5.2f%c", resampled[k], " \n"[!((k+1)&7) || k+1 == job.output_frames_gen]);
+  printf("%-26s %s\n", arg[0], src_strerror(status));   /* then the result.    */
+
+  if (argc > 3)                            /* A 3rd argument: show the version. */
+    printf("runtime=%s\n", src_get_version());
+
+  free(resampled);                                                /* Tidy up. */
+  return !!status;
+}
diff --git a/examples/2-stream.C b/examples/2-stream.C
new file mode 100644
index 0000000..29c1bf6
--- /dev/null
+++ b/examples/2-stream.C
@@ -0,0 +1,78 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 2: resample a raw, single-channel, floating-point data stream from
+ * stdin to stdout.
+ *
+ * The application uses the single function `soxr_process' for both input and
+ * output to/from the resampler; compared to the `input function' approach
+ * (illustrated in example 3) this requires that the application implements
+ * more logic, but one less function.
+ *
+ * Arguments are: INPUT-RATE OUTPUT-RATE
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+int main(int argc, char const * arg[])
+{
+  double const irate = argc > 1? atof(arg[1]) : 96000.;
+  double const orate = argc > 2? atof(arg[2]) : 44100.;
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates, keeping each at least 1 sample long: */
+  #define buf_total_len 15000  /* In samples. */
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1);
+  size_t const ilen = buf_total_len - olen;
+  size_t const osize = sizeof(float), isize = osize;
+  void * obuf = malloc(osize * olen);
+  void * ibuf = malloc(isize * ilen);
+  size_t odone, written, need_input = 1;
+  soxr_error_t error;
+
+  /* Create a stream resampler: */
+  soxr_t soxr = soxr_create(
+      irate, orate, 1,             /* Input rate, output rate, # of channels. */
+      &error,                         /* To report any error during creation. */
+      NULL, NULL, NULL);                        /* Use configuration defaults.*/
+
+  if (!error) {                         /* If all is well, run the resampler: */
+    USE_STD_STDIO;
+                                                       /* Resample in blocks: */
+    do {
+      size_t ilen1 = 0;
+
+      if (need_input) {
+
+        /* Read one block into the buffer, ready to be resampled: */
+        ilen1 = fread(ibuf, isize, ilen, stdin);
+
+        if (!ilen1) {     /* If there is no (more) input data available, */
+          free(ibuf);     /* set ibuf to NULL, to indicate end-of-input */
+          ibuf = NULL;    /* to the resampler. */
+        }
+      }
+
+      /* Copy data from the input buffer into the resampler, and resample
+       * to produce as much output as is possible to the given output buffer: */
+      error = soxr_process(soxr, ibuf, ilen1, NULL, obuf, olen, &odone);
+
+      written = fwrite(obuf, osize, odone, stdout); /* Consume output.*/
+
+      /* If the actual amount of data output is less than that requested, and
+       * we have not already reached the end of the input data, then supply some
+       * more input next time round the loop: */
+      need_input = odone < olen && ibuf;
+
+    } while (!error && (need_input || written));
+  }
+                                                                  /* Tidy up: */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; I/O: %s\n", arg[0], soxr_strerror(error),
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/3-options-input-fn.c b/examples/3-options-input-fn.c
new file mode 100644
index 0000000..afd43b9
--- /dev/null
+++ b/examples/3-options-input-fn.c
@@ -0,0 +1,114 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 3: extends example 2 with multiple channels, multiple datatypes,
+ * and other options.
+ *
+ * The application provides an input function, called on demand by libsoxr, in
+ * response to calls to soxr_output(); compared to the `process' approach
+ * (illustrated in example 2) this requires that the application implements
+ * less logic, but one more function.
+ *
+ * The 11 arguments (which are optional, from last to first) are:
+ *   INPUT-RATE       As example 2
+ *   OUTPUT-RATE      Ditto
+ *   NUM-CHANNELS     Number of interleaved channels
+ *   IN-DATATYPE#     0:float32 1:float64 2:int32 3:int16
+ *   OUT-DATATYPE#    Ditto; or 11 for un-dithered int16
+ *   Q-RECIPE         Quality recipe (in hex) See soxr.h
+ *   Q-FLAGS          Quality flags  (in hex) See soxr.h
+ *   PASSBAND-END     %
+ *   STOPBAND-BEGIN   %
+ *   PHASE-RESPONSE   [0,100]
+ *   USE-THREADS      1 to use multi-threading (where available)
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+typedef struct {void * ibuf; size_t isize;} input_context_t;
+
+static size_t input_fn(input_context_t * p, soxr_cbuf_t * buf, size_t len)
+{
+  /* Fill the context's buffer from stdin; fewer than `len' may be read: */
+  size_t const got = fread(p->ibuf, p->isize, len, stdin);
+
+  /* Tell the resampler where the data is (NULL if a read error occurred): */
+  if (got || !ferror(stdin)) *buf = p->ibuf;
+  else *buf = NULL;
+
+  return got;                           /* # of samples per channel to input. */
+}
+
+int main(int n, char const * arg[])
+{  /* Arguments are consumed in order; each absent one takes its default: */
+  char const *     const arg0 = n? --n, *arg++ : "", * engine = "";
+  double          const irate = n? --n, atof(*arg++) : 96000.;
+  double          const orate = n? --n, atof(*arg++) : 44100.;
+  unsigned        const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+  soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned        const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+  unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+  double   const passband_end = n? --n, atof(*arg++) : 0;
+  double const stopband_begin = n? --n, atof(*arg++) : 0;
+  double const phase_response = n? --n, atof(*arg++) : -1;
+  int       const use_threads = n? --n, atoi(*arg++) : 1;
+  soxr_datatype_t const otype = ospec & 3;   /* Low 2 bits: output datatype. */
+
+  soxr_quality_spec_t       q_spec = soxr_quality_spec(q_recipe, q_flags);
+  soxr_io_spec_t            io_spec = soxr_io_spec(itype, otype);
+  soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates: */
+  #define buf_total_len 15000  /* In samples per channel. */
+  size_t const osize = soxr_datatype_size(otype) * chans;
+  size_t const isize = soxr_datatype_size(itype) * chans;
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1); /* olen,ilen >= 1 */
+  size_t const ilen = buf_total_len - olen;
+  void * const obuf = malloc(osize * olen);
+  void * const ibuf = malloc(isize * ilen);
+
+  input_context_t icontext;
+  size_t odone, clips = 0;
+  soxr_error_t error;
+  soxr_t soxr;
+
+  /* Overrides (if given): */
+  if (passband_end   > 0) q_spec.passband_end   = passband_end / 100;
+  if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+  if (phase_response >=0) q_spec.phase_response = phase_response;
+  io_spec.flags = ospec & ~7u;     /* Bits 3 and up of OUT-DATATYPE#: flags. */
+
+  /* Create a stream resampler: */
+  soxr = soxr_create(
+      irate, orate, chans,         /* Input rate, output rate, # of channels. */
+      &error,                         /* To report any error during creation. */
+      &io_spec, &q_spec, &runtime_spec);
+
+  if (!error) {                      /* Register input_fn with the resampler: */
+    icontext.ibuf = ibuf, icontext.isize = isize;
+    error = soxr_set_input_fn(soxr, (soxr_input_fn_t)input_fn, &icontext, ilen);
+  }
+
+  if (!error) {                         /* If all is well, run the resampler: */
+    engine = soxr_engine(soxr);
+    USE_STD_STDIO;
+                                                       /* Resample in blocks: */
+    do odone = soxr_output(soxr, obuf, olen);
+    while (fwrite(obuf, osize, odone, stdout));            /* Consume output. */
+
+    error = soxr_error(soxr);            /* Check if any soxr error occurred. */
+    clips = *soxr_num_clips(soxr);     /* Can occur only with integer output. */
+  }
+                                                                  /* Tidy up: */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; %lu clips; I/O: %s (%s)\n",
+      arg0, soxr_strerror(error), (long unsigned)clips,
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error", engine);
+  return !!error;
+}
diff --git a/examples/4-split-channels.c b/examples/4-split-channels.c
new file mode 100644
index 0000000..a9022ce
--- /dev/null
+++ b/examples/4-split-channels.c
@@ -0,0 +1,161 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 4: variant of examples 2 & 3, demonstrating I/O with split channels.
+ *
+ * Note that, for convenience of the demonstration, split-channel data is
+ * made available by deinterleaving data sourced from and sent to
+ * interleaved file-streams; this adds a lot of code to the example that,
+ * for purposes of understanding how to use split-channels, may safely be
+ * ignored.  In a real application, the channel-data might never be
+ * interleaved; for example, the split-channel data output from the
+ * resampler might be sent directly to digital-to-analogue converters.
+ *
+ * Note also (not shown in the examples) that split/interleaved channels may
+ * be used for input and output independently.
+ *
+ * Arguments are as example 3.
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+
+
+#define DEINTERLEAVE(T) do { /* Body of deinterleave() for sample type T: */ \
+  unsigned i; \
+  size_t j; \
+  T * const * dest = (T * const *)dest0; \
+  T const * src = src0; \
+  if (ch == 1) memcpy(dest[0], src, n * sizeof(dest[0][0])); \
+  else for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) dest[i][j] = *src++; \
+  return; /* N.B. returns from the function that uses this macro. */ \
+} while (0)
+
+static void deinterleave(soxr_datatype_t data_type,
+    void * const * dest0,     /* Array of `ch' buffers, one per channel. */
+    void const * src0,        /* Interleaved source samples. */
+    size_t n, unsigned ch)    /* n: # of samples per channel; ch: # of channels. */
+{
+  switch (data_type & 3) {    /* Select the sample type; each case returns. */
+    case SOXR_FLOAT32: DEINTERLEAVE(float);
+    case SOXR_FLOAT64: DEINTERLEAVE(double);
+    case SOXR_INT32  : DEINTERLEAVE(int32_t);
+    case SOXR_INT16  : DEINTERLEAVE(int16_t);
+    default: break;
+  }
+}
+
+#define INTERLEAVE(T) do { /* Body of interleave() for sample type T: */ \
+  unsigned i; \
+  size_t j; \
+  T * dest = dest0; \
+  T const * const * src = (T const * const *)src0; \
+  if (ch == 1) memcpy(dest, src[0], n * sizeof(dest[0])); \
+  else for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) *dest++ = src[i][j]; \
+  return; /* N.B. returns from the function that uses this macro. */ \
+} while (0)
+
+static void interleave(soxr_datatype_t data_type, void * dest0, /* dest0: o/p. */
+  void * const * src0, size_t n, unsigned ch) /* src0: `ch' buffers of n each. */
+{
+  switch (data_type & 3) {    /* Select the sample type; each case returns. */
+    case SOXR_FLOAT32: INTERLEAVE(float);
+    case SOXR_FLOAT64: INTERLEAVE(double);
+    case SOXR_INT32  : INTERLEAVE(int32_t);
+    case SOXR_INT16  : INTERLEAVE(int16_t);
+    default: break;
+  }
+}
+
+int main(int n, char const * arg[])
+{  /* Arguments are consumed in order; each absent one takes its default: */
+  char const *     const arg0 = n? --n, *arg++ : "";
+  double          const irate = n? --n, atof(*arg++) : 96000.;
+  double          const orate = n? --n, atof(*arg++) : 44100.;
+  unsigned        const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+  soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned        const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+  unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+  double   const passband_end = n? --n, atof(*arg++) : 0;
+  double const stopband_begin = n? --n, atof(*arg++) : 0;
+  double const phase_response = n? --n, atof(*arg++) : -1;
+  int       const use_threads = n? --n, atoi(*arg++) : 1;
+  soxr_datatype_t const otype = ospec & 3;   /* Low 2 bits: output datatype. */
+
+  soxr_quality_spec_t  q_spec = soxr_quality_spec(q_recipe, q_flags);
+  soxr_io_spec_t       io_spec=soxr_io_spec(itype|SOXR_SPLIT, otype|SOXR_SPLIT);
+  soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates, keeping each at least 1 sample (per channel) long: */
+  #define buf_total_len 15000  /* In samples per channel. */
+  size_t const osize = soxr_datatype_size(otype) * chans;
+  size_t const isize = soxr_datatype_size(itype) * chans;
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1);
+  size_t const ilen = buf_total_len - olen;
+  /* For split channels: */
+  void * * const obuf_ptrs = malloc(sizeof(void *) * chans);
+  void * *       ibuf_ptrs = malloc(sizeof(void *) * chans);
+  char * const obufs = malloc(osize * olen), * optr = obufs;
+  char * const ibufs = malloc(isize * ilen), * iptr = ibufs;
+
+  /* For interleaved channels: */
+  char * const obuf = malloc(osize * olen);
+  char * const ibuf = malloc(isize * ilen);
+
+  size_t odone, written, need_input = 1, clips = 0;
+  soxr_error_t error;
+  soxr_t soxr;
+  unsigned i;
+
+  /* Overrides (if given): */
+  if (passband_end   > 0) q_spec.passband_end   = passband_end / 100;
+  if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+  if (phase_response >=0) q_spec.phase_response = phase_response;
+  io_spec.flags = ospec & ~7u;     /* Bits 3 and up of OUT-DATATYPE#: flags. */
+
+  soxr = soxr_create(
+      irate, orate, chans, &error, &io_spec, &q_spec, &runtime_spec);
+
+  for (i = 0; i < chans; ++i) {   /* Point at each channel's part of the bufs: */
+    ibuf_ptrs[i] = iptr;
+    obuf_ptrs[i] = optr;
+    iptr += ilen * soxr_datatype_size(itype);
+    optr += olen * soxr_datatype_size(otype);
+  }
+
+  if (!error) {
+    USE_STD_STDIO;
+
+    do {
+      size_t ilen1 = 0;
+
+      if (need_input) {
+        if (!(ilen1 = fread(ibuf, isize, ilen, stdin)))
+          free(ibuf_ptrs), ibuf_ptrs = 0; /* If none available, don't retry. */
+        else deinterleave(itype, ibuf_ptrs, ibuf, ilen1, chans);
+      }
+
+      error = soxr_process(soxr, ibuf_ptrs, ilen1, NULL, obuf_ptrs, olen, &odone);
+      interleave(otype, obuf, obuf_ptrs, odone, chans);  /* Consume output... */
+      written = fwrite(obuf, osize, odone, stdout);
+
+      need_input = odone < olen && ibuf_ptrs;
+
+    } while (!error && (need_input || written));
+
+    clips = *soxr_num_clips(soxr);     /* Can occur only with integer output. */
+  }
+                                                                  /* Tidy up: */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf), free(obufs), free(ibufs);
+  free(obuf_ptrs), free(ibuf_ptrs);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; %lu clips; I/O: %s\n",
+      arg0, soxr_strerror(error), (long unsigned)clips,
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/5-variable-rate.c b/examples/5-variable-rate.c
new file mode 100644
index 0000000..1a1c63f
--- /dev/null
+++ b/examples/5-variable-rate.c
@@ -0,0 +1,93 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Example 5:  Variable-rate resampling.  A test signal (held in a buffer) is
+ * resampled over a wide range of octaves.  Resampled data is sent to stdout as
+ * raw, float32 samples.  Choices of 2 test-signals and of 2 ways of varying
+ * the sample-rate are combined in a command-line option:
+ *
+ * Usage: ./5-variable-rate [0|1|2|3]
+ */
+
+#include <soxr.h>
+#include "examples-common.h"
+
+#define OCTAVES  5       /* Resampling range, +/- (in octaves). */
+#define OLEN     16      /* Output length in seconds. */
+#define FS       44100   /* Output sampling rate in Hz. */
+
+/* For output pos in [0,1], returns an ioratio in the 2^-OCTAVES to 2^+OCTAVES range: */
+static double ioratio(double pos, int fm)
+{
+  if (fm) /* fm: non-0 for a fast-changing ioratio, 0 for a slow sweep. */
+    pos = .5 - cos(pos * 2 * M_PI) * .4 + sin(pos * OLEN * 20 * M_PI) * .05;
+  return pow(2, 2 * OCTAVES * pos - OCTAVES);
+}
+
+int main(int argc, char *arg[])
+{  /* Option bit 0: saw-tooth (else sine) signal; bit 1: fast-changing ratio. */
+  int opt = argc <= 1? 2 : (atoi(arg[1]) & 3), saw = opt & 1, fm = opt & 2;
+  float ibuf[10 << OCTAVES], obuf[AL(ibuf)];
+  int i, wl = 2 << OCTAVES;               /* wl: input wave-length, in samples. */
+  size_t ilen = AL(ibuf), need_input = 1, written;
+  size_t odone, total_odone, total_olen = OLEN * FS;
+  size_t olen1 = fm? 10 : AL(obuf); /* Small block-len if fast-changing ratio */
+  soxr_error_t error;
+
+  /* When creating a var-rate resampler, q_spec must be set as follows: */
+  soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_HQ, SOXR_VR);
+
+  /* The ratio of the given input rate and output rates must equate to the
+   * maximum I/O ratio that will be used: */
+  soxr_t soxr = soxr_create(1 << OCTAVES, 1, 1, &error, NULL, &q_spec, NULL);
+
+  if (!error) {
+    USE_STD_STDIO;
+
+    /* Generate input signal, sine or saw, with wave-length = wl: */
+    for (i = 0; i < (int)ilen; ++i)
+      ibuf[i] = (float)(saw? (i%wl)/(wl-1.)-.5 : .9 * sin(2 * M_PI * i / wl));
+
+    /* Set the initial resampling ratio (N.B. 3rd parameter = 0): */
+    soxr_set_io_ratio(soxr, ioratio(0, fm), 0);
+
+    /* Resample in blocks of size olen1: */
+    for (total_odone = 0; !error && total_odone < total_olen;) {
+
+      /* The last block might be shorter: */
+      size_t block_len = min(olen1, total_olen - total_odone);
+
+      /* Determine the position in [0,1] of the end of the current block: */
+      double pos = (double)(total_odone + block_len) / (double)total_olen;
+
+      /* Calculate an ioratio for this position and instruct the resampler to
+       * move smoothly to the new value, over the course of outputting the next
+       * 'block_len' samples (or give 0 for an instant change instead): */
+      soxr_set_io_ratio(soxr, ioratio(pos, fm), block_len);
+
+      /* Output the block of samples, supplying input samples as needed: */
+      do {
+        size_t len = need_input? ilen : 0;
+        error = soxr_process(soxr, ibuf, len, NULL, obuf, block_len, &odone);
+        written = fwrite(obuf, sizeof(float), odone, stdout);
+
+        /* Update counters for the current block and for the total length: */
+        block_len -= odone;
+        total_odone += odone;
+
+        /* If soxr_process did not provide the complete block, we must call it
+         * again, supplying more input samples: */
+        need_input = block_len != 0;
+
+      } while (need_input && !error && written == odone);
+
+      /* Now that the block for the current ioratio is complete, go back
+       * round the main `for' loop in order to process the next block. */
+    }
+    soxr_delete(soxr);
+  }
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; I/O: %s\n", arg[0], soxr_strerror(error),
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error");
+  return !!error;
+}
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
new file mode 100644
index 0000000..c8c17c9
--- /dev/null
+++ b/examples/CMakeLists.txt
@@ -0,0 +1,36 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${PROJECT_CXX_FLAGS}")
+link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES})
+# Example 3 (and 1a, given LSR bindings) is built for tests as well as examples:
+if (${BUILD_EXAMPLES} OR ${BUILD_TESTS})
+  set (SOURCES 3-options-input-fn)
+  if (${WITH_LSR_BINDINGS})
+    set (LSR_SOURCES 1a-lsr)
+  endif ()
+endif ()
+# The other examples are built only when examples are requested:
+if (${BUILD_EXAMPLES})
+  list (APPEND SOURCES 1-single-block 2-stream 4-split-channels)
+  if (${WITH_VR32})
+    list (APPEND SOURCES 5-variable-rate)
+  endif ()
+endif ()
+# One executable per source, named after the source; 1a also links soxr-lsr:
+foreach (fe ${SOURCES} ${LSR_SOURCES})
+  get_filename_component (f ${fe} NAME_WE)
+  add_executable (${f} ${fe})
+  if (${f} STREQUAL "1a-lsr")
+    target_link_libraries (${f} soxr-lsr)
+  endif ()
+endforeach ()
+
+if (${BUILD_TESTS} AND ${WITH_LSR_BINDINGS})
+  add_test (lsr-bindings ${BIN}1a-lsr)
+endif ()
+# Install the example sources and their README alongside the documentation:
+file (GLOB INSTALL_SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.[cCh])
+install (FILES ${INSTALL_SOURCES} ${CMAKE_CURRENT_SOURCE_DIR}/README
+    DESTINATION ${DOC_INSTALL_DIR}/examples)
diff --git a/examples/README b/examples/README
new file mode 100644
index 0000000..a58939b
--- /dev/null
+++ b/examples/README
@@ -0,0 +1,20 @@
+SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+
+These simple examples show the different ways that an application may
+interface with soxr.  Note that real-world applications may also have to
+deal with file-formats, codecs, (more sophisticated) dithering, etc., which
+are not covered here.
+
+With the library installed, the examples may be built using commands similar
+to the following.  On unix-like systems:
+
+    cc 1-single-block.c -lsoxr
+    cc 1a-lsr.c -lsoxr-lsr
+
+or, with MSVC on MS-Windows:
+
+    cl 1-single-block.c -I"C:/Program Files/soxr/include" "C:/Program Files/soxr/lib/soxr.lib"
+    cl 1a-lsr.c -I"C:/Program Files/soxr/include" "C:/Program Files/soxr/lib/soxr-lsr.lib"
+
+IDEs may hide such commands behind configuration screens and build menus --
+where applicable, consult your IDE's user-manual.
diff --git a/examples/examples-common.h b/examples/examples-common.h
new file mode 100644
index 0000000..fc8ed82
--- /dev/null
+++ b/examples/examples-common.h
@@ -0,0 +1,47 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Common includes etc. for the examples.  */
+
+#include <assert.h>
+#include <errno.h>
+#include <limits.h>
+#include <math.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#ifdef _WIN32
+  /* Work-around for broken file-I/O on MS-Windows: use binary-mode std. I/O. */
+  #include <io.h>
+  #include <fcntl.h>
+  #define USE_STD_STDIO _setmode(_fileno(stdout), _O_BINARY), \
+                        _setmode(_fileno(stdin ), _O_BINARY)
+#else
+  #define USE_STD_STDIO /* Expands to nothing on other systems. */
+#endif
+/* Names for 16- and 32-bit integer types; the latter chosen via <limits.h>: */
+#undef int16_t
+#define int16_t short
+
+#undef int32_t
+#if LONG_MAX > 2147483647L
+  #define int32_t int   /* long is wider than 32 bits here, so int is used. */
+#elif LONG_MAX < 2147483647L
+  #error this programme requires that 'long int' has at least 32-bits
+#else
+  #define int32_t long  /* long is exactly 32 bits here. */
+#endif
+/* N.B. min and max evaluate one of their arguments twice: */
+#undef min
+#define min(x,y) ((x)<(y)?(x):(y))
+
+#undef max
+#define max(x,y) ((x)>(y)?(x):(y))
+
+#undef AL
+#define AL(a) (sizeof(a)/sizeof((a)[0]))  /* Array Length */
+
+#undef M_PI /* Sometimes missing, so ensure that it is defined: */
+#define M_PI 3.14159265358979323846
diff --git a/go b/go
new file mode 100755
index 0000000..7fba810
--- /dev/null
+++ b/go
@@ -0,0 +1,18 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+case "$1" in -j*) j="$1"; shift;; esac   # Support -jX for parallel build/test
+
+test x"$1" = x && build=Release || build="$1"   # Build type, also the build dir
+
+rm -f CMakeCache.txt             # Prevent interference from any in-tree build
+
+mkdir -p "$build"
+cd "$build"
+
+cmake -Wno-dev -DCMAKE_BUILD_TYPE="$build" ..
+make $j
+ctest $j || { echo "FAILURE details in $build/Testing/Temporary/LastTest.log"; exit 1; }
diff --git a/go.bat b/go.bat
new file mode 100644
index 0000000..aabff75
--- /dev/null
+++ b/go.bat
@@ -0,0 +1,27 @@
+@echo off
+rem SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+set build=%1
+if x%build% == x set build=Release
+
+rem Prevent interference from any in-tree build
+del/f CMakeCache.txt
+
+mkdir %build%
+cd %build%
+
+cmake -G "NMake Makefiles" -DCMAKE_BUILD_TYPE=%build% -Wno-dev ..
+if errorlevel 1 goto end
+
+nmake
+if errorlevel 1 goto end
+
+nmake test
+if errorlevel 1 goto error
+goto end
+
+:error
+echo FAILURE details in Testing\Temporary\LastTest.log
+
+:end
diff --git a/inst-check b/inst-check
new file mode 100755
index 0000000..8cf64b7
--- /dev/null
+++ b/inst-check
@@ -0,0 +1,25 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Sanity-check of library installed on unix-like system
+
+# This script checks the installation of the entire library (including lsr).
+#
+# Distros using three separate packages can do the following (in order):
+#
+# * Install soxr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr
+# * Install soxr-lsr pkg (i.e. basically, just the shared object)
+# * ./inst-check-soxr-lsr
+# * Install the -dev pkg (i.e. examples, headers, & pkg-config)
+# * ./inst-check PATH-OF-INSTALLED-EXAMPLES-DIR (e.g. /usr/share/doc/libsoxr/examples)
+
+# Where are the example source files:
+src="$1"
+test x"$src" = x && src=/usr/local/share/doc/libsoxr/examples
+
+dir="$(dirname "$(readlink -f "$0")")"   # Directory holding this script
+"$dir/inst-check-soxr" "$src"
+"$dir/inst-check-soxr-lsr" "$src"
diff --git a/inst-check-soxr b/inst-check-soxr
new file mode 100755
index 0000000..5f923b8
--- /dev/null
+++ b/inst-check-soxr
@@ -0,0 +1,52 @@
+#!/bin/sh
+set -e
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Sanity-check of sub-library installed on unix-like system
+
+arg="$1" # path to installed examples (if dev pkg installed); otherwise omitted
+dir="$(dirname "$(readlink -f "$0")")"
+
+# Find the examples:
+src="$arg"
+test x"$src" = x && src="$dir/examples"
+cd "$src"
+
+# Somewhere to put the binaries (removed on any exit, including a failed check):
+tmp=`mktemp -d`; trap 'rm -rf "$tmp"' EXIT
+
+build_examples() {
+  if [ x"$arg" = x ]; then
+    echo "Examples in `pwd`; using local headers:" # for when dev pkg not installed
+    libs=-l$1
+    cflags=-I$dir/src
+  else
+    echo "Examples in `pwd`; using pkg-config:"
+    libs=$(pkg-config --libs $1)
+    cflags=$(pkg-config --cflags $1)
+  fi
+  for f in ?$2-*.[cC]; do
+    cc=cc; echo $f | grep -q C$ && cc=c++
+    out=$tmp/`echo $f | sed "s/.[cC]$//"`
+    cmd="$cc $cflags -o $out $f $libs -lm"
+    echo $cmd; $cmd
+  done
+}
+
+# Determine library:
+if [ `basename $0` = inst-check-soxr ]; then
+  build_examples soxr
+  gen="dd if=/dev/urandom count=1000"
+  $tmp/1-single-block 1 2 .
+  $gen 2> /dev/null | $tmp/2-stream                     2>&1 >$tmp/stdout
+  $gen 2> /dev/null | $tmp/3-options-input-fn 6 7 2 2 0 2>&1 >$tmp/stdout
+  $gen 2> /dev/null | $tmp/4-split-channels   7 6 2 2 3 2>&1 >$tmp/stdout  # Clipping expected here
+  $gen 2> /dev/null | $tmp/5-variable-rate              2>&1 >$tmp/stdout
+else
+  build_examples soxr-lsr a # lsr has 'a' suffix on example number.
+  $tmp/1a-lsr 1 2 .
+fi
+
+# Tidy up:
+rm -rf $tmp
diff --git a/inst-check-soxr-lsr b/inst-check-soxr-lsr
new file mode 120000
index 0000000..ec971fb
--- /dev/null
+++ b/inst-check-soxr-lsr
@@ -0,0 +1 @@
+inst-check-soxr
\ No newline at end of file
diff --git a/msvc/README b/msvc/README
new file mode 100644
index 0000000..5b7f60a
--- /dev/null
+++ b/msvc/README
@@ -0,0 +1,22 @@
+SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+
+Cmake is the recommended way to configure, build (as either a DLL or a static
+library), and install libsoxr for general use on MS-Windows, as on other OSs.
+
+However, building within MS Visual Studio is also possible, as exemplified by
+the accompanying files:
+
+ * soxr-config.h       Pre-configured for a modern Win32 system.
+
+ * libsoxr.vcproj      Builds the library as a DLL, per above.
+
+ * libsoxr.sln,        Build an example exe using the above.
+   example1.vcproj
+
+The following notes apply to adaptation of these files:
+
+ * For a system without AVX support, set WITH_CR64S to 0 in
+   soxr-config.h and exclude the three files ...64s.c from the build.
+
+ * If changing libsoxr.vcproj to build a static library, then also
+   remove the preprocessor definition: SOXR_DLL.
diff --git a/msvc/example1.vcproj b/msvc/example1.vcproj
new file mode 100644
index 0000000..1523855
--- /dev/null
+++ b/msvc/example1.vcproj
@@ -0,0 +1,82 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="example1"
+	ProjectGUID="{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}"
+	RootNamespace="soxr"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform Name="Win32" />
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="..\src"
+				PreprocessorDefinitions="WIN32;_DEBUG;_CONSOLE"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				TargetMachine="1"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="1"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="..\src"
+				PreprocessorDefinitions="WIN32;NDEBUG;_CONSOLE"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="1"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<File RelativePath="..\examples\1-single-block.c" />
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/msvc/libsoxr.sln b/msvc/libsoxr.sln
new file mode 100644
index 0000000..bb3e9c7
--- /dev/null
+++ b/msvc/libsoxr.sln
@@ -0,0 +1,29 @@
+ï»¿
+Microsoft Visual Studio Solution File, Format Version 10.00
+# Visual C++ Express 2008
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "example1", "example1.vcproj", "{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}"
+	ProjectSection(ProjectDependencies) = postProject
+		{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB} = {4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}
+	EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libsoxr", "libsoxr.vcproj", "{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}"
+EndProject
+Global
+	GlobalSection(SolutionConfigurationPlatforms) = preSolution
+		Release|Win32 = Release|Win32
+		Debug|Win32 = Debug|Win32
+	EndGlobalSection
+	GlobalSection(ProjectConfigurationPlatforms) = postSolution
+		{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.ActiveCfg = Release|Win32
+		{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Release|Win32.Build.0 = Release|Win32
+		{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.ActiveCfg = Debug|Win32
+		{CA28595B-B14F-45FD-BA56-FBDFFB70FFC4}.Debug|Win32.Build.0 = Debug|Win32
+		{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.ActiveCfg = Release|Win32
+		{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Release|Win32.Build.0 = Release|Win32
+		{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.ActiveCfg = Debug|Win32
+		{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}.Debug|Win32.Build.0 = Debug|Win32
+	EndGlobalSection
+	GlobalSection(SolutionProperties) = preSolution
+		HideSolutionNode = FALSE
+	EndGlobalSection
+EndGlobal
diff --git a/msvc/libsoxr.vcproj b/msvc/libsoxr.vcproj
new file mode 100644
index 0000000..65ee42d
--- /dev/null
+++ b/msvc/libsoxr.vcproj
@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="Windows-1252"?>
+<VisualStudioProject
+	ProjectType="Visual C++"
+	Version="9.00"
+	Name="libsoxr"
+	ProjectGUID="{4916B0C1-2F99-433A-B88A-A99CB4E1E0AB}"
+	RootNamespace="libsoxr"
+	Keyword="Win32Proj"
+	TargetFrameworkVersion="196613"
+	>
+	<Platforms>
+		<Platform Name="Win32" />
+	</Platforms>
+	<ToolFiles>
+	</ToolFiles>
+	<Configurations>
+		<Configuration
+			Name="Debug|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="2"
+			CharacterSet="1"
+			>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="0"
+				AdditionalIncludeDirectories="."
+				PreprocessorDefinitions="_USE_MATH_DEFINES;_CRT_SECURE_NO_WARNINGS;SOXR_LIB;SOXR_DLL;soxr_EXPORTS"
+				MinimalRebuild="true"
+				BasicRuntimeChecks="3"
+				RuntimeLibrary="3"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="4"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="2"
+				GenerateDebugInformation="true"
+				SubSystem="2"
+				TargetMachine="1"
+			/>
+		</Configuration>
+		<Configuration
+			Name="Release|Win32"
+			OutputDirectory="$(SolutionDir)$(ConfigurationName)"
+			IntermediateDirectory="$(ConfigurationName)"
+			ConfigurationType="2"
+			CharacterSet="1"
+			WholeProgramOptimization="1"
+			>
+			<Tool
+				Name="VCCLCompilerTool"
+				Optimization="2"
+				EnableIntrinsicFunctions="true"
+				AdditionalIncludeDirectories="."
+				PreprocessorDefinitions="NDEBUG;_USE_MATH_DEFINES;_CRT_SECURE_NO_WARNINGS;SOXR_LIB;SOXR_DLL;soxr_EXPORTS"
+				RuntimeLibrary="2"
+				EnableFunctionLevelLinking="true"
+				UsePrecompiledHeader="0"
+				WarningLevel="3"
+				DebugInformationFormat="3"
+			/>
+			<Tool
+				Name="VCLinkerTool"
+				LinkIncremental="1"
+				GenerateDebugInformation="true"
+				SubSystem="2"
+				OptimizeReferences="2"
+				EnableCOMDATFolding="2"
+				TargetMachine="1"
+			/>
+		</Configuration>
+	</Configurations>
+	<References>
+	</References>
+	<Files>
+		<File RelativePath="..\src\cr.c" />
+		<File RelativePath="..\src\cr32.c" />
+		<File RelativePath="..\src\cr32s.c" />
+		<File RelativePath="..\src\cr64.c" />
+		<File RelativePath="..\src\cr64s.c" />
+		<File RelativePath="..\src\data-io.c" />
+		<File RelativePath="..\src\dbesi0.c" />
+		<File RelativePath="..\src\fft4g32.c" />
+		<File RelativePath="..\src\fft4g64.c" />
+		<File RelativePath="..\src\filter.c" />
+		<File RelativePath="..\src\pffft32s.c" />
+		<File RelativePath="..\src\pffft64s.c" />
+		<File RelativePath="..\src\util32s.c" />
+		<File RelativePath="..\src\util64s.c" />
+		<File RelativePath="..\src\soxr.c" />
+		<File RelativePath="..\src\vr32.c" />
+	</Files>
+	<Globals>
+	</Globals>
+</VisualStudioProject>
diff --git a/msvc/soxr-config.h b/msvc/soxr-config.h
new file mode 100644
index 0000000..89f7a91
--- /dev/null
+++ b/msvc/soxr-config.h
@@ -0,0 +1,30 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* N.B. Pre-configured for modern MS-Windows systems.  However, the normal
+ * procedure is to use the cmake configuration and build system. See INSTALL. */
+
+#if !defined soxr_config_included
+#define soxr_config_included
+
+#define AVCODEC_FOUND 0
+#define AVUTIL_FOUND 0
+#define WITH_PFFFT 1
+
+#define HAVE_FENV_H 1
+#define HAVE_STDBOOL_H 1
+#define HAVE_STDINT_H 1
+#define HAVE_LRINT 1
+#define HAVE_BIGENDIAN 0
+
+#define WITH_CR32 1
+#define WITH_CR32S 1
+#define WITH_CR64 1
+#define WITH_CR64S 1
+#define WITH_VR32 1
+
+#define WITH_HI_PREC_CLOCK 1
+#define WITH_FLOAT_STD_PREC_CLOCK 0
+#define WITH_DEV_TRACE 1
+
+#endif
diff --git a/multi-arch b/multi-arch
new file mode 100755
index 0000000..288b578
--- /dev/null
+++ b/multi-arch
@@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+rm -f CMakeCache.txt             # Prevent interference from any in-tree build
+
+j=-j4
+build=Release
+
+for n in \
+    cc: \
+    clang: \
+    arm-linux-gnueabi-gcc:Linux \
+    x86_64-w64-mingw32-gcc:Windows \
+    i686-w64-mingw32-gcc:Windows \
+    ; do
+  compiler=$(echo $n | sed 's/:.*//')   # Text before the colon
+  system=$(echo $n | sed 's/.*://')     # Text after the colon; empty => native build
+  dir=$build-$compiler
+  if ! which $compiler > /dev/null; then echo $compiler not found; else (  # Skip absent compilers
+  echo "***" $dir
+  mkdir -p $dir
+    cd $dir
+    cmake -DCMAKE_BUILD_TYPE=$build -DCMAKE_C_COMPILER=$compiler -DCMAKE_SYSTEM_NAME="$system" -DBUILD_SHARED_LIBS=OFF -DWITH_OPENMP=OFF ..
+    make $j && [ /$system = / ] && ctest -j || true   # ctest only when $system is empty
+    cd tests
+    ../../tests/throughput-test && SOXR_THROUGHPUT_GAIN=.6 ../../tests/throughput-test 2 3 || true
+  ); fi
+done
diff --git a/soxr-config.h.in b/soxr-config.h.in
new file mode 100644
index 0000000..00b3b45
--- /dev/null
+++ b/soxr-config.h.in
@@ -0,0 +1,27 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_config_included
+#define soxr_config_included
+
+#cmakedefine01 AVCODEC_FOUND
+#cmakedefine01 AVUTIL_FOUND
+#cmakedefine01 WITH_PFFFT
+
+#cmakedefine01 HAVE_FENV_H
+#cmakedefine01 HAVE_STDBOOL_H
+#cmakedefine01 HAVE_STDINT_H
+#cmakedefine01 HAVE_LRINT
+#cmakedefine01 HAVE_BIGENDIAN
+
+#cmakedefine01 WITH_CR32
+#cmakedefine01 WITH_CR32S
+#cmakedefine01 WITH_CR64
+#cmakedefine01 WITH_CR64S
+#cmakedefine01 WITH_VR32
+
+#cmakedefine01 WITH_HI_PREC_CLOCK
+#cmakedefine01 WITH_FLOAT_STD_PREC_CLOCK
+#cmakedefine01 WITH_DEV_TRACE
+
+#endif
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 0000000..bb01a0d
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,129 @@
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+
+
+# Can generate vr-coefs.h but it complicates cross-compiling & non-cmake builds
+
+if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/vr-coefs.h)
+  include_directories(${CMAKE_CURRENT_BINARY_DIR})
+  set_property(SOURCE vr32.c
+      APPEND PROPERTY OBJECT_DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h)
+  add_executable (vr-coefs vr-coefs.c)
+  target_link_libraries (vr-coefs ${LIBM_LIBRARIES})
+  ADD_CUSTOM_COMMAND(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
+    COMMAND vr-coefs > ${CMAKE_CURRENT_BINARY_DIR}/vr-coefs.h
+    DEPENDS vr-coefs)
+endif ()
+
+
+
+add_definitions (${PROJECT_C_FLAGS} -DSOXR_LIB)
+
+
+
+# Libsoxr configuration:
+
+set (RDFT32 fft4g32)
+if (AVCODEC_FOUND)
+  set (RDFT32 avfft32)
+  set (RDFT32S avfft32s)
+elseif (WITH_PFFFT)
+  #set (RDFT32 pffft32)
+  set (RDFT32S pffft32s)
+elseif (WITH_CR32S)
+  set (RDFT32S fft4g32s)
+  if (NOT WITH_CR32)
+    list (APPEND RDFT32S fft4g32)
+  endif ()
+endif ()
+
+set (SOURCES ${PROJECT_NAME}.c data-io)
+
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
+  list (APPEND SOURCES dbesi0 filter fft4g64 cr)
+endif ()
+
+if (WITH_CR32)
+  list (APPEND SOURCES cr32 ${RDFT32})
+endif ()
+
+if (WITH_CR64)
+  list (APPEND SOURCES cr64)
+endif ()
+
+if (WITH_VR32)
+  list (APPEND SOURCES vr32)
+endif ()
+
+if (WITH_CR32S)
+  foreach (source cr32s ${RDFT32S} util32s)
+    list (APPEND SOURCES ${source})
+    set_property (SOURCE ${source}
+        APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD32_C_FLAGS})
+  endforeach ()
+endif ()
+
+if (WITH_CR64S)
+  foreach (source cr64s pffft64s util64s)
+    list (APPEND SOURCES ${source})
+    set_property (SOURCE ${source}
+        APPEND_STRING PROPERTY COMPILE_FLAGS ${SIMD64_C_FLAGS})
+  endforeach ()
+endif ()
+
+
+
+# Libsoxr:
+
+add_library (${PROJECT_NAME} ${LIB_TYPE} ${SOURCES})
+target_link_libraries (${PROJECT_NAME} PRIVATE ${LIBS} ${LIBM_LIBRARIES})
+set_target_properties (${PROJECT_NAME} PROPERTIES
+  VERSION "${SO_VERSION}"
+  SOVERSION ${SO_VERSION_MAJOR}
+  INSTALL_NAME_DIR ${LIB_INSTALL_DIR}
+  LINK_INTERFACE_LIBRARIES ""
+  PUBLIC_HEADER "${PROJECT_NAME}.h")
+if (BUILD_FRAMEWORK)
+  set_target_properties (${PROJECT_NAME} PROPERTIES FRAMEWORK TRUE)
+elseif (NOT WIN32)
+  set (TARGET_PCS ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc)
+  configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${PROJECT_NAME}.pc.in ${TARGET_PCS})
+  install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${PROJECT_NAME}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+endif ()
+
+
+
+# LSR bindings:
+
+if (WITH_LSR_BINDINGS)
+  set (LSR ${PROJECT_NAME}-lsr)
+  set (LSR_SO_VERSION 0.1.9)
+  set (LSR_SO_VERSION_MAJOR 0)
+  add_library (${LSR} ${LIB_TYPE} ${LSR})
+  target_link_libraries (${LSR} ${PROJECT_NAME})
+  set_target_properties (${LSR} PROPERTIES
+    VERSION "${LSR_SO_VERSION}"
+    SOVERSION ${LSR_SO_VERSION_MAJOR}
+    INSTALL_NAME_DIR ${LIB_INSTALL_DIR}
+    LINK_INTERFACE_LIBRARIES ""
+    PUBLIC_HEADER "${LSR}.h")
+  if (BUILD_FRAMEWORK)
+    set_target_properties (${LSR} PROPERTIES FRAMEWORK TRUE)
+  elseif (NOT WIN32)
+    set (TARGET_PCS "${TARGET_PCS} ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc")
+    configure_file (${CMAKE_CURRENT_SOURCE_DIR}/${LSR}.pc.in ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc)
+    install (FILES ${CMAKE_CURRENT_BINARY_DIR}/${LSR}.pc DESTINATION ${LIB_INSTALL_DIR}/pkgconfig)
+  endif ()
+endif ()
+
+
+
+# Installation (from build from source):
+
+install (TARGETS ${PROJECT_NAME} ${LSR}
+  FRAMEWORK DESTINATION ${FRAMEWORK_INSTALL_DIR}
+  LIBRARY DESTINATION ${LIB_INSTALL_DIR}
+  RUNTIME DESTINATION ${BIN_INSTALL_DIR}
+  ARCHIVE DESTINATION ${LIB_INSTALL_DIR}
+  PUBLIC_HEADER DESTINATION ${INCLUDE_INSTALL_DIR})
diff --git a/src/aliases.h b/src/aliases.h
new file mode 100644
index 0000000..d1a392f
--- /dev/null
+++ b/src/aliases.h
@@ -0,0 +1,39 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined SOXR_LIB
+
+#define lsx_bessel_I_0                 _soxr_bessel_I_0
+#define lsx_cdft_f                     _soxr_cdft_f
+#define lsx_cdft                       _soxr_cdft
+#define lsx_clear_fft_cache_f          _soxr_clear_fft_cache_f
+#define lsx_clear_fft_cache            _soxr_clear_fft_cache
+#define lsx_ddct_f                     _soxr_ddct_f
+#define lsx_ddct                       _soxr_ddct
+#define lsx_ddst_f                     _soxr_ddst_f
+#define lsx_ddst                       _soxr_ddst
+#define lsx_design_lpf                 _soxr_design_lpf
+#define lsx_dfct_f                     _soxr_dfct_f
+#define lsx_dfct                       _soxr_dfct
+#define lsx_dfst_f                     _soxr_dfst_f
+#define lsx_dfst                       _soxr_dfst
+#define lsx_fir_to_phase               _soxr_fir_to_phase
+#define lsx_f_resp                     _soxr_f_resp
+#define lsx_init_fft_cache_f           _soxr_init_fft_cache_f
+#define lsx_init_fft_cache             _soxr_init_fft_cache
+#define lsx_inv_f_resp                 _soxr_inv_f_resp
+#define lsx_kaiser_beta                _soxr_kaiser_beta
+#define lsx_kaiser_params              _soxr_kaiser_params
+#define lsx_make_lpf                   _soxr_make_lpf
+#define lsx_ordered_convolve_f         _soxr_ordered_convolve_f
+#define lsx_ordered_convolve           _soxr_ordered_convolve
+#define lsx_ordered_partial_convolve_f _soxr_ordered_partial_convolve_f
+#define lsx_ordered_partial_convolve   _soxr_ordered_partial_convolve
+#define lsx_rdft_f                     _soxr_rdft_f
+#define lsx_rdft                       _soxr_rdft
+#define lsx_safe_cdft_f                _soxr_safe_cdft_f
+#define lsx_safe_cdft                  _soxr_safe_cdft
+#define lsx_safe_rdft_f                _soxr_safe_rdft_f
+#define lsx_safe_rdft                  _soxr_safe_rdft
+
+#endif
diff --git a/src/avfft32.c b/src/avfft32.c
new file mode 100644
index 0000000..fe651f5
--- /dev/null
+++ b/src/avfft32.c
@@ -0,0 +1,33 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <stdlib.h>
+#include <math.h>
+#include <libavcodec/avfft.h>
+#include "filter.h"
+#include "rdft_t.h"
+
+static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
+static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
+static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
+static int multiplier(void) {return 2;}
+static void nothing(void) {}
+static int flags(void) {return 0;}
+
+fn_t _soxr_rdft32_cb[] = {
+  (fn_t)forward_setup,
+  (fn_t)backward_setup,
+  (fn_t)av_rdft_end,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)_soxr_ordered_convolve_f,
+  (fn_t)_soxr_ordered_partial_convolve_f,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+  (fn_t)malloc,
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
+};
diff --git a/src/avfft32s.c b/src/avfft32s.c
new file mode 100644
index 0000000..5a7e62d
--- /dev/null
+++ b/src/avfft32s.c
@@ -0,0 +1,32 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <math.h>
+#include <libavcodec/avfft.h>
+#include "util32s.h"
+#include "rdft_t.h"
+
+static void * forward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),DFT_R2C);}
+static void * backward_setup(int len) {return av_rdft_init((int)(log(len)/log(2)+.5),IDFT_C2R);}
+static void rdft(int length, void * setup, float * h) {av_rdft_calc(setup, h); (void)length;}
+static int multiplier(void) {return 2;}
+static void nothing(void) {}
+static int flags(void) {return RDFT_IS_SIMD;}
+
+fn_t _soxr_rdft32s_cb[] = {
+  (fn_t)forward_setup,
+  (fn_t)backward_setup,
+  (fn_t)av_rdft_end,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)rdft,
+  (fn_t)ORDERED_CONVOLVE_SIMD,
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+  (fn_t)SIMD_ALIGNED_MALLOC,
+  (fn_t)SIMD_ALIGNED_CALLOC,
+  (fn_t)SIMD_ALIGNED_FREE,
+  (fn_t)flags,
+};
diff --git a/src/ccrw2.h b/src/ccrw2.h
new file mode 100644
index 0000000..09331a4
--- /dev/null
+++ b/src/ccrw2.h
@@ -0,0 +1,75 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Concurrent Control with "Readers" and "Writers", P.J. Courtois et al, 1971 */
+
+#if !defined soxr_ccrw2_included
+#define soxr_ccrw2_included
+
+#if defined SOXR_LIB
+#include "internal.h"
+#endif
+
+#if defined _OPENMP
+
+#include <omp.h>
+
+typedef struct {
+  int readcount, writecount; /* initial value = 0 */
+  omp_lock_t mutex_1, mutex_2, mutex_3, w, r; /* initial value = 1 */
+} ccrw2_t; /* Problem #2: `writers-preference' */
+
+#define ccrw2_become_reader(p) do { /* p: a ccrw2_t lvalue (not a pointer) */\
+  omp_set_lock(&(p).mutex_3);\
+    omp_set_lock(&(p).r);\
+      omp_set_lock(&(p).mutex_1);\
+        if (++(p).readcount == 1) omp_set_lock(&(p).w); /* 1st reader takes w */\
+      omp_unset_lock(&(p).mutex_1);\
+    omp_unset_lock(&(p).r);\
+  omp_unset_lock(&(p).mutex_3);\
+} while (0)
+#define ccrw2_cease_reading(p) do {\
+  omp_set_lock(&(p).mutex_1);\
+    if (!--(p).readcount) omp_unset_lock(&(p).w); /* Last reader releases w */\
+  omp_unset_lock(&(p).mutex_1);\
+} while (0)
+#define ccrw2_become_writer(p) do {\
+  omp_set_lock(&(p).mutex_2);\
+    if (++(p).writecount == 1) omp_set_lock(&(p).r); /* 1st writer takes r */\
+  omp_unset_lock(&(p).mutex_2);\
+  omp_set_lock(&(p).w);\
+} while (0)
+#define ccrw2_cease_writing(p) do {\
+  omp_unset_lock(&(p).w);\
+  omp_set_lock(&(p).mutex_2);\
+    if (!--(p).writecount) omp_unset_lock(&(p).r); /* Last writer releases r */\
+  omp_unset_lock(&(p).mutex_2);\
+} while (0)
+#define ccrw2_init(p) do { /* Locks only; the two counts are not set here */\
+  omp_init_lock(&(p).mutex_1);\
+  omp_init_lock(&(p).mutex_2);\
+  omp_init_lock(&(p).mutex_3);\
+  omp_init_lock(&(p).w);\
+  omp_init_lock(&(p).r);\
+} while (0)
+#define ccrw2_clear(p) do { /* Destroys the locks in reverse order of init */\
+  omp_destroy_lock(&(p).r);\
+  omp_destroy_lock(&(p).w);\
+  omp_destroy_lock(&(p).mutex_3);\
+  omp_destroy_lock(&(p).mutex_2);\
+  omp_destroy_lock(&(p).mutex_1);\
+} while (0)
+
+#else
+
+typedef int ccrw2_t;
+#define ccrw2_become_reader(x) (void)(x)
+#define ccrw2_cease_reading(x) (void)(x)
+#define ccrw2_become_writer(x) (void)(x)
+#define ccrw2_cease_writing(x) (void)(x)
+#define ccrw2_init(x) (void)(x)
+#define ccrw2_clear(x) (void)(x)
+
+#endif /* _OPENMP */
+
+#endif
diff --git a/src/cr-core.c b/src/cr-core.c
new file mode 100644
index 0000000..159a5d9
--- /dev/null
+++ b/src/cr-core.c
@@ -0,0 +1,314 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details.
+ *
+ * Constant-rate resampling engine-specific code. */
+
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+  #include "internal.h"
+  #include "cr.h"
+  #if CORE_TYPE & CORE_DBL
+    typedef double sample_t;
+    #if CORE_TYPE & CORE_SIMD_DFT
+      #define RDFT_CB    _soxr_rdft64s_cb
+    #else
+      #define RDFT_CB    _soxr_rdft64_cb
+    #endif
+  #else
+    typedef float sample_t;
+    #if CORE_TYPE & CORE_SIMD_DFT
+      #define RDFT_CB    _soxr_rdft32s_cb
+    #else
+      #define RDFT_CB    _soxr_rdft32_cb
+    #endif
+  #endif
+
+  #if CORE_TYPE & (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+    #if CORE_TYPE & CORE_DBL
+      #include "util64s.h"
+      #include "dev64s.h"
+    #else
+      #include "util32s.h"
+      #include "dev32s.h"
+    #endif
+  #endif
+
+  extern fn_t RDFT_CB[];
+#else
+  #define RDFT_CB 0
+#endif
+
+
+
+static void cubic_stage_fn(stage_t * p, fifo_t * output_fifo)
+{ /* Evaluates a cubic in the fractional position for each step of p->at. */
+  sample_t const * input = stage_read_p(p);
+  int num_in = min(stage_occupancy(p), p->input_size); /* Capped at input_size */
+  int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio); /* Reservation size */
+  sample_t * output = fifo_reserve(output_fifo, max_num_out);
+
+  for (i = 0; p->at.integer < num_in; ++i, p->at.whole += p->step.whole) {
+    sample_t const * s = input + p->at.integer; /* Reads s[-1], s[0], s[1], s[2] */
+    double x = p->at.fraction * (1 / MULT32); /* Fraction scaled by 1/MULT32 */
+    double b = .5*(s[1]+s[-1])-*s, a = (1/6.)*(s[2]-s[1]+s[-1]-*s-4*b);
+    double c = s[1]-*s-a-b; /* Makes the cubic equal s[1] at x = 1 */
+    output[i] = (sample_t)(p->mult * (((a*x + b)*x + c)*x + *s)); /* = mult * *s at x = 0 */
+  }
+  assert(max_num_out - i >= 0); /* Never wrote more than was reserved */
+  fifo_trim_by(output_fifo, max_num_out - i); /* Presumably drops the unused reservation */
+  fifo_read(&p->fifo, p->at.integer, NULL); /* Presumably discards consumed input -- confirm */
+  p->at.integer = 0; /* Integer part of the position starts again from 0 */
+}
+
+
+
+#if defined __AVX__
+  #define DEFINED_AVX 1
+#else
+  #define DEFINED_AVX 0
+#endif
+
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined _M_IX86
+  #define DEFINED_X86 1
+#else
+  #define DEFINED_X86 0
+#endif
+
+#if defined __arm__
+  #define DEFINED_ARM 1
+#else
+  #define DEFINED_ARM 0
+#endif
+
+
+
+#if CORE_TYPE & CORE_DBL
+  #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_AVX)
+  #define SIMD_SSE 0
+#else
+  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_X86)
+  #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_HALF) && DEFINED_ARM)
+
+
+
+#include "half-coefs.h"
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+#define FUNCTION_H h7
+#define CONVOLVE ____ __ _
+#include "half-fir.h"
+#endif
+
+#define FUNCTION_H h8
+#define CONVOLVE ____ ____
+#include "half-fir.h"
+
+#define FUNCTION_H h9
+#define CONVOLVE ____ ____ _
+#include "half-fir.h"
+
+#if CORE_TYPE & CORE_DBL
+  #define FUNCTION_H h10
+  #define CONVOLVE ____ ____ __
+  #include "half-fir.h"
+
+  #define FUNCTION_H h11
+  #define CONVOLVE ____ ____ __ _
+  #include "half-fir.h"
+
+  #define FUNCTION_H h12
+  #define CONVOLVE ____ ____ ____
+  #include "half-fir.h"
+
+  #define FUNCTION_H h13
+  #define CONVOLVE ____ ____ ____ _
+  #include "half-fir.h"
+#endif
+
+static half_fir_info_t const half_firs[] = {
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+  { 7, half_fir_coefs_7 , h7 , 0  , 120.65f},
+#endif
+  { 8, half_fir_coefs_8 , h8 , 0  , 136.51f},
+  { 9, half_fir_coefs_9 , h9 , 0  , 152.32f},
+#if CORE_TYPE & CORE_DBL
+  {10, half_fir_coefs_10, h10, 0  , 168.08f},
+  {11, half_fir_coefs_11, h11, 0  , 183.79f},
+  {12, half_fir_coefs_12, h12, 0  , 199.46f},
+  {13, half_fir_coefs_13, h13, 0  , 215.12f},
+#endif
+};
+
+#undef SIMD_AVX
+#undef SIMD_NEON
+#undef SIMD_SSE
+
+
+
+#if CORE_TYPE & CORE_DBL
+  #define SIMD_AVX ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_AVX)
+  #define SIMD_SSE 0
+#else
+  #define SIMD_SSE ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_X86)
+  #define SIMD_AVX 0
+#endif
+
+#define SIMD_NEON ((CORE_TYPE & CORE_SIMD_POLY) && DEFINED_ARM)
+
+
+
+#define COEFS (sample_t * __restrict)p->shared->poly_fir_coefs
+#define VAR_LENGTH p->n
+#define VAR_CONVOLVE(n) while (j < (n)) _
+#define VAR_POLY_PHASE_BITS p->phase_bits
+
+
+
+#define FUNCTION vpoly0
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir0.h"
+
+#define FUNCTION vpoly1
+#define COEF_INTERP 1
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly2
+#define COEF_INTERP 2
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+#define FUNCTION vpoly3
+#define COEF_INTERP 3
+#define PHASE_BITS VAR_POLY_PHASE_BITS
+#define FIR_LENGTH VAR_LENGTH
+#define CONVOLVE(n) VAR_CONVOLVE(n)
+#include "poly-fir.h"
+
+
+
+#if !(CORE_TYPE & CORE_SIMD_POLY)
+
+#define poly_fir_convolve_U100 _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION U100_0
+#define FIR_LENGTH U100_l
+#define CONVOLVE(n) poly_fir_convolve_U100
+#include "poly-fir0.h"
+
+#define u100_l 11
+#define poly_fir_convolve_u100 _ _ _ _ _ _ _ _ _ _ _
+#define FUNCTION u100_0
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir0.h"
+
+#define FUNCTION u100_1
+#define COEF_INTERP 1
+#define PHASE_BITS 8
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#define FUNCTION u100_2
+#define COEF_INTERP 2
+#define PHASE_BITS 6
+#define FIR_LENGTH u100_l
+#define CONVOLVE(n) poly_fir_convolve_u100
+#include "poly-fir.h"
+
+#endif
+
+#define u100_1_b 8
+#define u100_2_b 6
+
+
+
+static poly_fir_t const poly_firs[] = {
+  {-1, {{0, vpoly0}, { 7.2f, vpoly1}, {5.0f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 9.4f, vpoly1}, {6.7f, vpoly2}}},
+  {-1, {{0, vpoly0}, {12.4f, vpoly1}, {7.8f, vpoly2}}},
+  {-1, {{0, vpoly0}, {13.6f, vpoly1}, {9.3f, vpoly2}}},
+  {-1, {{0, vpoly0}, {10.5f, vpoly2}, {8.4f, vpoly3}}},
+  {-1, {{0, vpoly0}, {11.85f,vpoly2}, {9.0f, vpoly3}}},
+
+  {-1, {{0, vpoly0}, { 8.0f, vpoly1}, {5.3f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 8.6f, vpoly1}, {5.7f, vpoly2}}},
+  {-1, {{0, vpoly0}, {10.6f, vpoly1}, {6.75f,vpoly2}}},
+  {-1, {{0, vpoly0}, {12.6f, vpoly1}, {8.6f, vpoly2}}},
+  {-1, {{0, vpoly0}, { 9.6f, vpoly2}, {7.6f, vpoly3}}},
+  {-1, {{0, vpoly0}, {11.4f, vpoly2}, {8.65f,vpoly3}}},
+
+#if CORE_TYPE & CORE_SIMD_POLY
+  {10.62f, {{0, vpoly0}, {0, 0}, {0, 0}}},
+  {-1, {{0, vpoly0}, {u100_1_b, vpoly1}, {u100_2_b, vpoly2}}},
+#else
+  {10.62f, {{U100_l, U100_0}, {0, 0}, {0, 0}}},
+  {11.28f, {{u100_l, u100_0}, {u100_1_b, u100_1}, {u100_2_b, u100_2}}},
+#endif
+  {-1, {{0, vpoly0}, {   9, vpoly1}, {  6, vpoly2}}},
+  {-1, {{0, vpoly0}, {  11, vpoly1}, {  7, vpoly2}}},
+  {-1, {{0, vpoly0}, {  13, vpoly1}, {  8, vpoly2}}},
+  {-1, {{0, vpoly0}, {  10, vpoly2}, {  8, vpoly3}}},
+  {-1, {{0, vpoly0}, {  12, vpoly2}, {  9, vpoly3}}},
+};
+
+
+
+static cr_core_t const cr_core = {
+
+#if CORE_TYPE & CORE_SIMD_POLY
+  {SIMD_ALIGNED_MALLOC, SIMD_ALIGNED_CALLOC, SIMD_ALIGNED_FREE},
+#else
+  {malloc, calloc, free},
+#endif
+  half_firs, array_length(half_firs),
+  0, 0,
+  cubic_stage_fn,
+  poly_firs, RDFT_CB
+};
+
+
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+static char const * rate_create(void * channel, void * shared, double io_ratio,
+    soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale)
+{
+  return _soxr_init(channel, shared, io_ratio, q_spec, r_spec, scale,
+      &cr_core, CORE_TYPE);
+}
+
+
+
+static char const * id(void) {return CORE_STR;}
+
+fn_t RATE_CB[] = {
+  (fn_t)_soxr_input,
+  (fn_t)_soxr_process,
+  (fn_t)_soxr_output,
+  (fn_t)_soxr_flush,
+  (fn_t)_soxr_close,
+  (fn_t)_soxr_delay,
+  (fn_t)_soxr_sizes,
+  (fn_t)rate_create,
+  (fn_t)0,
+  (fn_t)id,
+};
+
+#endif
diff --git a/src/cr.c b/src/cr.c
new file mode 100644
index 0000000..4122db3
--- /dev/null
+++ b/src/cr.c
@@ -0,0 +1,588 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details.
+ *
+ * Constant-rate resampling common code. */
+
+#include <math.h>
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "filter.h"
+
+#if defined SOXR_LIB
+  #include "internal.h"
+  #define STATIC
+#endif
+
+#include "cr.h"
+
+#define num_coefs4 ((core_flags&CORE_SIMD_POLY)? ((num_coefs+3)&~3) : num_coefs)
+
+#define coef_coef(C,T,x) \
+  C((T*)result, interp_order, num_coefs4, j, x, num_coefs4 - 1 - i)
+
+#define STORE(C,T) { \
+  if (interp_order > 2) coef_coef(C,T,3) = (T)d; \
+  if (interp_order > 1) coef_coef(C,T,2) = (T)c; \
+  if (interp_order > 0) coef_coef(C,T,1) = (T)b; \
+  coef_coef(C,T,0) = (T)f0;}
+
+static real * prepare_poly_fir_coefs(double const * coefs, int num_coefs,
+    int num_phases, int interp_order, double multiplier,
+    core_flags_t core_flags, alloc_t const * mem)
+{ /* Returns a mem->calloc'd table laid out as per coef() or coef4(). */
+  int i, j, length = num_coefs4 * num_phases * (interp_order + 1);
+  real * result = mem->calloc(1,(size_t)length << LOG2_SIZEOF_REAL(core_flags));
+  double fm1 = coefs[0], f1 = 0, f2 = 0; /* N.B. seed is not x multiplier. */
+  /* NOTE(review): result is used unchecked; a failed calloc is written to. */
+  for (i = num_coefs - 1; i >= 0; --i)
+    for (j = num_phases - 1; j >= 0; --j) {
+      double f0 = fm1, b = 0, c = 0, d = 0; /* = 0 to kill compiler warning */
+      int pos = i * num_phases + j - 1;
+      fm1 = pos > 0 ? coefs[pos - 1] * multiplier : 0;
+      switch (interp_order) {       /* Terms derived from neighbouring taps: */
+        case 1: b = f1 - f0; break;
+        case 2: b = f1 - (.5 * (f2+f0) - f1) - f0; c = .5 * (f2+f0) - f1; break;
+        case 3: c=.5*(f1+fm1)-f0;d=(1/6.)*(f2-f1+fm1-f0-4*c);b=f1-f0-d-c; break;
+        default: assert(!interp_order);
+      }
+      switch (core_flags & 3) {     /* Bit 0: double; bit 1: SIMD (x4) layout. */
+        case 0: if (WITH_CR32 ) STORE(coef , float ); break;
+        case 1: if (WITH_CR64 ) STORE(coef , double); break;
+        case 2: if (WITH_CR32S) STORE(coef4, float ); break;
+        default:if (WITH_CR64S) STORE(coef4, double); break;
+      }
+      f2 = f1, f1 = f0;
+    }
+  return result;
+}
+
+#undef STORE
+#undef coef_coef
+
+#define IS_FLOAT32 ((WITH_CR32 || WITH_CR32S) && \
+    (!(WITH_CR64 || WITH_CR64S) || sizeof_real == sizeof(float))) /* float? */
+#define WITH_FLOAT64 (WITH_CR64 || WITH_CR64S)  /* Parenthesised: negatable. */
+
+static void dft_stage_fn(stage_t * p, fifo_t * output_fifo)
+{ /* Filter one block via the DFT once enough input has been queued. */
+  real * output, * dft_out;
+  int i, j, num_in = max(0, fifo_occupancy(&p->fifo));
+  rate_shared_t const * s = p->shared;
+  dft_filter_t const * f = &s->dft_filter[p->dft_filter_num];
+  int const overlap = f->num_taps - 1;
+  /* Act only once enough input is queued to fill a whole DFT block: */
+  if (p->at.integer + p->L * num_in >= f->dft_length) {
+    fn_t const * const RDFT_CB = p->rdft_cb;
+    size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(p->core_flags);
+    div_t divd = div(f->dft_length - overlap - p->at.integer + p->L - 1, p->L);
+    real const * input = fifo_read_ptr(&p->fifo);
+    fifo_read(&p->fifo, divd.quot, NULL);
+    num_in -= divd.quot;
+
+    output = fifo_reserve(output_fifo, f->dft_length);
+    dft_out = (p->core_flags & CORE_SIMD_DFT)? p->dft_out : output;
+
+    if (lsx_is_power_of_2(p->L)) { /* F-domain */
+      int portion = f->dft_length / p->L;
+      memcpy(dft_out, input, (unsigned)portion * sizeof_real);
+      rdft_oforward(portion, f->dft_forward_setup, dft_out, p->dft_scratch);
+      if (IS_FLOAT32) {
+#define dft_out ((float *)dft_out)
+        for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+          dft_out[i] = dft_out[(portion << 1) - i],
+            dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+        dft_out[portion] = dft_out[1];
+        dft_out[portion + 1] = 0;
+        dft_out[1] = dft_out[0];
+#undef dft_out
+      }
+      else if (WITH_FLOAT64) {
+#define dft_out ((double *)dft_out)
+        for (i = portion + 2; i < (portion << 1); i += 2) /* Mirror image. */
+          dft_out[i] = dft_out[(portion << 1) - i],
+            dft_out[i+1] = -dft_out[(portion << 1) - i + 1];
+        dft_out[portion] = dft_out[1];
+        dft_out[portion + 1] = 0;
+        dft_out[1] = dft_out[0];
+#undef dft_out
+      }
+      /* Keep copying the filled part upwards, doubling it, to dft_length: */
+      for (portion <<= 1; i < f->dft_length; i += portion, portion <<= 1) {
+        memcpy((char *)dft_out + (size_t)i * sizeof_real, dft_out, (size_t)portion * sizeof_real);
+        if (IS_FLOAT32)
+        #define dft_out ((float *)dft_out)
+          dft_out[i + 1] = 0;
+        #undef dft_out
+        else if (WITH_FLOAT64)
+        #define dft_out ((double *)dft_out)
+          dft_out[i + 1] = 0;
+        #undef dft_out
+      }
+      if (p->step.integer > 0)
+        rdft_reorder_back(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+    } else {
+      if (p->L == 1)
+        memcpy(dft_out, input, (size_t)f->dft_length * sizeof_real);
+      else {
+        memset(dft_out, 0, (size_t)f->dft_length * sizeof_real);
+        if (IS_FLOAT32)
+          for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+            ((float *)dft_out)[i] = ((float *)input)[j];
+        else if (WITH_FLOAT64)
+          for (j = 0, i = p->at.integer; i < f->dft_length; ++j, i += p->L)
+            ((double *)dft_out)[i] = ((double *)input)[j];
+        p->at.integer = p->L - 1 - divd.rem;
+      }
+      if (p->step.integer > 0)
+        rdft_forward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+      else
+        rdft_oforward(f->dft_length, f->dft_forward_setup, dft_out, p->dft_scratch);
+    }
+    /* Apply f->coefs to the transformed block, transform back, then trim: */
+    if (p->step.integer > 0) {
+      rdft_convolve(f->dft_length, f->dft_backward_setup, dft_out, f->coefs);
+      rdft_backward(f->dft_length, f->dft_backward_setup, dft_out, p->dft_scratch);
+      if ((p->core_flags & CORE_SIMD_DFT) && p->step.integer == 1)
+        memcpy(output, dft_out, (size_t)f->dft_length * sizeof_real);
+      if (p->step.integer != 1) {
+        if (IS_FLOAT32)
+          for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+              i += p->step.integer)
+            ((float *)output)[j] = ((float *)dft_out)[i];
+        else if (WITH_FLOAT64)
+          for (j = 0, i = p->remM; i < f->dft_length - overlap; ++j,
+              i += p->step.integer)
+            ((double *)output)[j] = ((double *)dft_out)[i];
+        p->remM = i - (f->dft_length - overlap);
+        fifo_trim_by(output_fifo, f->dft_length - j);
+      }
+      else fifo_trim_by(output_fifo, overlap);
+    }
+    else { /* F-domain */
+      int m = -p->step.integer;
+      rdft_convolve_portion(f->dft_length >> m, dft_out, f->coefs);
+      rdft_obackward(f->dft_length >> m, f->dft_backward_setup, dft_out, p->dft_scratch);
+      if (p->core_flags & CORE_SIMD_DFT)
+        memcpy(output, dft_out, (size_t)(f->dft_length >> m) * sizeof_real);
+      fifo_trim_by(output_fifo, (((1 << m) - 1) * f->dft_length + overlap) >>m);
+    }
+    (void)RDFT_CB;
+  }
+  p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+/* DFT size = 4 x nearest power of 2 to num_taps, or half of that if there */
+/* is danger of too many cache misses; min & large are log2 size limits.   */
+static int set_dft_length(int num_taps, int min, int large)
+{
+  double const e = log((double)num_taps) / log(2.);   /* log2(num_taps) */
+  return 1 << range_limit((int)(e + 2.77), min, max((int)(e + 1.77), large));
+}
+
+static void dft_stage_init(
+    unsigned instance, double Fp, double Fs, double Fn, double att,
+    double phase_response, stage_t * p, int L, int M, double * multiplier,
+    unsigned min_dft_size, unsigned large_dft_size, core_flags_t core_flags,
+    fn_t const * RDFT_CB)
+{ /* Set up stage p to run dft_stage_fn with filter slot `instance' at L/M. */
+  dft_filter_t * f = &p->shared->dft_filter[instance];
+  int num_taps = 0, dft_length = f->dft_length, i, offset;
+  bool f_domain_m = abs(3-M) == 1 && Fs <= 1;
+  size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+  /* A zero dft_length is taken to mean the filter is yet to be designed: */
+  if (!dft_length) {
+    int k = phase_response == 50 && lsx_is_power_of_2(L) && Fn == L? L << 1 : 4;
+    double m, * h = lsx_design_lpf(Fp, Fs, Fn, att, &num_taps, -k, -1.);
+
+    if (phase_response != 50)
+      lsx_fir_to_phase(&h, &num_taps, &f->post_peak, phase_response);
+    else f->post_peak = num_taps / 2;
+
+    dft_length = set_dft_length(num_taps, (int)min_dft_size, (int)large_dft_size);
+    f->coefs = rdft_calloc((size_t)dft_length, sizeof_real);
+    offset = dft_length - num_taps + 1;
+    m = (1. / dft_length) * rdft_multiplier() * L * *multiplier;
+    if (IS_FLOAT32) for (i = 0; i < num_taps; ++i)
+        ((float *)f->coefs)[(i + offset) & (dft_length - 1)] =(float)(h[i] * m);
+    else if (WITH_FLOAT64) for (i = 0; i < num_taps; ++i)
+        ((double *)f->coefs)[(i + offset) & (dft_length - 1)] = h[i] * m;
+    free(h);
+  }
+  /* Work buffers belong to the stage, not to the shared filter: */
+  if (rdft_flags() & RDFT_IS_SIMD)
+    p->dft_out = rdft_malloc(sizeof_real * (size_t)dft_length);
+  if (rdft_flags() & RDFT_NEEDS_SCRATCH)
+    p->dft_scratch = rdft_malloc(2 * sizeof_real * (size_t)dft_length);
+  /* NOTE(review): none of the rdft_calloc/rdft_malloc results is checked. */
+  if (!f->dft_length) {
+    void * coef_setup = rdft_forward_setup(dft_length);
+    int Lp = lsx_is_power_of_2(L)? L : 1;
+    int Mp = f_domain_m? M : 1;
+    f->dft_forward_setup = rdft_forward_setup(dft_length / Lp);
+    f->dft_backward_setup = rdft_backward_setup(dft_length / Mp);
+    if (Mp == 1)
+      rdft_forward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+    else
+      rdft_oforward(dft_length, coef_setup, f->coefs, p->dft_scratch);
+    rdft_delete_setup(coef_setup);
+    f->num_taps = num_taps;
+    f->dft_length = dft_length;
+    lsx_debug("fir_len=%i dft_length=%i Fp=%g Fs=%g Fn=%g att=%g %i/%i",
+        num_taps, dft_length, Fp, Fs, Fn, att, L, M);
+  }
+  *multiplier = 1;          /* The gain was folded into m (hence f->coefs). */
+  p->out_in_ratio = (double)L / M;
+  p->core_flags = core_flags;
+  p->rdft_cb = RDFT_CB;
+  p->fn = dft_stage_fn;
+  p->preload = f->post_peak / L;
+  p->at.integer = f->post_peak % L;
+  p->L = L;
+  p->step.integer = f_domain_m? -M/2 : M; /* < 0 selects the F-domain path. */
+  p->dft_filter_num = instance;
+  p->block_len = f->dft_length - (f->num_taps - 1);
+  p->phase0 = p->at.integer / p->L;
+  p->input_size = (f->dft_length - p->at.integer + p->L - 1) / p->L;
+}
+
+static struct half_fir_info const * find_half_fir(
+    struct half_fir_info const * firs, size_t len, double att)
+{ /* First entry whose att is at least the one wanted, else the last entry. */
+  size_t k = 0;
+  while (k + 1 < len && firs[k].att < att) ++k;
+  return firs + k;
+}
+
+#define have_pre_stage  (preM  * preL  != 1)
+#define have_arb_stage  (arbM  * arbL  != 1)
+#define have_post_stage (postM * postL != 1)
+
+#include "soxr.h"
+
+STATIC char const * _soxr_init(
+  rate_t * const p,             /* Per audio channel. */
+  rate_shared_t * const shared, /* By channels undergoing same rate change. */
+  double const io_ratio,        /* Input rate divided by output rate. */
+  soxr_quality_spec_t const * const q_spec,
+  soxr_runtime_spec_t const * const r_spec,
+  double multiplier,            /* Linear gain to apply during conversion. */
+  cr_core_t const * const core,
+  core_flags_t const core_flags)
+{ /* Returns 0 on success, else a static string describing the error. */
+  size_t const sizeof_real = sizeof(char) << LOG2_SIZEOF_REAL(core_flags);
+  double const tolerance = 1 + 1e-5;
+
+  double       bits = q_spec->precision;
+  rolloff_t const rolloff = (rolloff_t)(q_spec->flags & 3);
+  int interpolator = (int)(r_spec->flags & 3) - 1;
+  double const Fp0 = q_spec->passband_end, Fs0 = q_spec->stopband_begin;
+  double const phase_response = q_spec->phase_response, tbw0 = Fs0-Fp0;
+
+  bool const maintain_3dB_pt = !!(q_spec->flags & SOXR_MAINTAIN_3DB_PT);
+  double tbw_tighten = 1, alpha;
+  #define tighten(x) (Fs0-(Fs0-(x))*tbw_tighten)
+
+  double arbM = io_ratio, Fn1, Fp1 = Fp0, Fs1 = Fs0, bits1 = min(bits,33);
+  double att = (bits1 + 1) * linear_to_dB(2.), attArb = att; /* +1: pass+stop */
+  int preL = 1, preM = 1, shr = 0, arbL = 1, postL = 1, postM = 1;
+  bool upsample=false, rational=false, iOpt=!(r_spec->flags&SOXR_NOSMALLINTOPT);
+  bool lq_bits= (q_spec->flags & SOXR_PROMOTE_TO_LQ)? bits <= 16 : bits == 16;
+  bool lq_Fp0 = (q_spec->flags & SOXR_PROMOTE_TO_LQ)? Fp0<=lq_bw0 : Fp0==lq_bw0;
+  int n = 0, i, mode = lq_bits && rolloff == rolloff_medium? io_ratio > 1 ||
+    phase_response != 50 || !lq_Fp0 || Fs0 != 1 : ((int)ceil(bits1) - 6) / 4;
+  struct half_fir_info const * half_fir_info;
+  stage_t * s;
+  /* Reject specifications outside the ranges handled below: */
+  if (io_ratio < 1 && Fs0 - 1 > 1 - Fp0 / tolerance)
+    return "imaging greater than rolloff";
+  if (.002 / tolerance > tbw0 || tbw0 > .5 * tolerance)
+    return "transition bandwidth not in [0.2,50] % of nyquist";
+  if (.5 / tolerance > Fp0 || Fs0 > 1.5 * tolerance)
+    return "transition band not within [50,150] % of nyquist";
+  if (bits!=0 && (15 > bits || bits > 33))
+    return "precision not in [15,33] bits";
+  if (io_ratio <= 0)
+    return "resampling factor not positive";
+  if (0 > phase_response || phase_response > 100)
+    return "phase response not in [0=min-phase,100=max-phase] %";
+
+  p->core = core;
+  p->io_ratio = io_ratio;
+  if (bits!=0) while (!n++) {                            /* Determine stages: */
+    int try, L, M, x, maxL = interpolator > 0? 1 : mode? 2048 :
+      (int)ceil(r_spec->coef_size_kbytes * 1000. / (U100_l * (int)sizeof_real));
+    double d, epsilon = 0, frac;
+    upsample = arbM < 1;
+    for (i = (int)(.5 * arbM), shr = 0; i >>= 1; arbM *= .5, ++shr);
+    preM = upsample || (arbM > 1.5 && arbM < 2);
+    postM = 1 + (arbM > 1 && preM), arbM /= postM;
+    preL = 1 + (!preM && arbM < 2) + (upsample && mode), arbM *= preL;
+    if ((frac = arbM - (int)arbM)!=0)
+      epsilon = fabs(floor(frac * MULT32 + .5) / (frac * MULT32) - 1);
+    for (i = 1, rational = frac==0; i <= maxL && !rational; ++i) {
+      d = frac * i, try = (int)(d + .5);
+      if ((rational = fabs(try / d - 1) <= epsilon)) {    /* No long doubles! */
+        if (try == i)
+          arbM = ceil(arbM), shr += x = arbM > 3, arbM /= 1 + x;
+        else arbM = i * (int)arbM + try, arbL = i;
+      }
+    }
+    L = preL * arbL, M = (int)(arbM * postM), x = (L|M)&1, L >>= !x, M >>= !x;
+    if (iOpt && postL == 1 && (d = preL * arbL / arbM) > 4 && d != 5) {
+      for (postL = 4, i = (int)(d / 16); (i >>= 1) && postL < 256; postL <<= 1);
+      arbM = arbM * postL / arbL / preL, arbL = 1, n = 0;
+    } else if (rational && (max(L, M) < 3 + 2 * iOpt || L * M < 6 * iOpt))
+      preL = L, preM = M, arbM = arbL = postM = 1;
+    if (!mode && (!rational || !n))
+      ++mode, n = 0;
+  }
+  /* One stage per halving (shr) plus each pre/arb/post stage required: */
+  p->num_stages = shr + have_pre_stage + have_arb_stage + have_post_stage;
+  if (!p->num_stages && multiplier != 1) {
+    bits = arbL = 0;                         /* Use cubic_stage in this case. */
+    ++p->num_stages;
+  }
+  p->stages = calloc((size_t)p->num_stages + 1, sizeof(*p->stages));
+  if (!p->stages)
+    return "out of memory";
+  for (i = 0; i < p->num_stages; ++i) {
+    p->stages[i].num = i;
+    p->stages[i].shared = shared;
+    p->stages[i].input_size = 8192;
+  }
+  p->stages[0].is_input = true;
+
+  alpha = postM / (io_ratio * (postL << 0));
+
+  if ((n = p->num_stages) > 1) {                              /* Att. budget: */
+    if (have_arb_stage)
+      att += linear_to_dB(2.), attArb = att, --n;
+    att += linear_to_dB((double)n);
+  }
+
+  half_fir_info = find_half_fir(core->half_firs, core->half_firs_len, att);
+  for (i = 0, s = p->stages; i < shr; ++i, ++s) {
+    s->fn = half_fir_info->fn;
+    s->coefs = half_fir_info->coefs;
+    s->n = half_fir_info->num_coefs;
+    s->pre_post = 4 * s->n;
+    s->preload = s->pre = s->pre_post >> 1;
+  }
+  /* Optional DFT pre-stage (shared filter slot 0): */
+  if (have_pre_stage) {
+    if (maintain_3dB_pt && have_post_stage) {    /* Trans. bands overlapping. */
+      double x = tbw0 * lsx_inv_f_resp(-3., att);
+      x = -lsx_f_resp(x / (max(2 * alpha - Fs0, alpha) - Fp0), att);
+      if (x > .035) {
+        tbw_tighten = ((4.3074e-3 - 3.9121e-4 * x) * x - .040009) * x + 1.0014;
+        lsx_debug("tbw_tighten=%g (%gdB)", tbw_tighten, x);
+      }
+    }
+    Fn1 = preM? max(preL, preM) : arbM / arbL;
+    dft_stage_init(0, tighten(Fp1), Fs1, Fn1, att, phase_response, s++, preL,
+        max(preM, 1), &multiplier, r_spec->log2_min_dft_size,
+        r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+    Fp1 /= Fn1, Fs1 /= Fn1;
+  }
+
+  if (bits==0 && have_arb_stage) {                /* `Quick' cubic arb stage: */
+    s->fn = core->cubic_stage_fn;
+    s->mult = multiplier, multiplier = 1;
+    s->step.whole = (int64_t)(arbM * MULT32 + .5);
+    s->pre_post = max(3, s->step.integer);
+    s->preload = s->pre = 1;
+    s->out_in_ratio = MULT32 / (double)s->step.whole;
+  }
+  else if (have_arb_stage) {                     /* Higher quality arb stage: */
+    static const float rolloffs[] = {-.01f, -.3f, 0, -.103f};
+    poly_fir_t const * f = &core->poly_firs[6*(upsample+!!preM)+mode-!upsample];
+    int order, num_coefs = (int)f->interp[0].scalar, phase_bits, phases;
+    size_t coefs_size;
+    double at, Fp = Fp1, Fs, Fn, mult = upsample? 1 : arbM / arbL;
+    poly_fir1_t const * f1;
+
+    if (!upsample && preM)
+      Fn = 2 * mult, Fs = 3 + fabs(Fs1 - 1);
+    else Fn = 1, Fs = 2 - (mode? Fp1 + (Fs1 - Fp1) * .7 : Fs1);
+
+    if (mode)
+      Fp = Fs - (Fs - Fp) / (1 - lsx_inv_f_resp(rolloffs[rolloff], attArb));
+
+    i = (interpolator < 0? !rational : max(interpolator, !rational)) - 1;
+    do {
+      f1 = &f->interp[++i];
+      assert(f1->fn);
+      if (i)
+        arbM /= arbL, arbL = 1, rational = false;
+      phase_bits = (int)ceil(f1->scalar - log(mult)/log(2.));
+      phases = !rational? (1 << phase_bits) : arbL;
+      if (f->interp[0].scalar==0) {
+        int phases0 = max(phases, 19), n0 = 0;
+        lsx_design_lpf(Fp, Fs, -Fn, attArb, &n0, phases0, f->beta);
+        num_coefs = n0 / phases0 + 1, num_coefs += num_coefs & !preM;
+      }
+      if ((num_coefs & 1) && rational && (arbL & 1))
+        phases <<= 1, arbL <<= 1, arbM *= 2;
+      at = arbL * (s->phase0 = .5 * (num_coefs & 1));
+      order = i + (i && mode > 4);
+      coefs_size = (size_t)(num_coefs4 * phases * (order+1)) * sizeof_real;
+    } while (interpolator < 0 && i < 2 && f->interp[i+1].fn &&
+        coefs_size / 1000 > r_spec->coef_size_kbytes);
+    /* The coef. table is built only while the shared pointer is still 0: */
+    if (!s->shared->poly_fir_coefs) {
+      int num_taps = num_coefs * phases - 1;
+      double * coefs = lsx_design_lpf(
+          Fp, Fs, Fn, attArb, &num_taps, phases, f->beta);
+      s->shared->poly_fir_coefs = prepare_poly_fir_coefs(
+          coefs, num_coefs, phases, order, multiplier, core_flags, &core->mem);
+      lsx_debug("fir_len=%i phases=%i coef_interp=%i size=%.3gk",
+          num_coefs, phases, order, (double)coefs_size / 1000.);
+      free(coefs);
+    }
+    multiplier = 1;
+    s->fn = f1->fn;
+    s->pre_post = num_coefs4 - 1;
+    s->preload = ((num_coefs - 1) >> 1) + (num_coefs4 - num_coefs);
+    s->n = num_coefs4;
+    s->phase_bits = phase_bits;
+    s->L = arbL;
+    s->use_hi_prec_clock =
+      mode>1 && (q_spec->flags & SOXR_HI_PREC_CLOCK) && !rational;
+#if WITH_FLOAT_STD_PREC_CLOCK
+    if (order && !s->use_hi_prec_clock) {
+      s->at.flt = at;
+      s->step.flt = arbM;
+      s->out_in_ratio = (double)(arbL / s->step.flt);
+    } else
+#endif
+    {
+      s->at.whole = (int64_t)(at * MULT32 + .5);
+#if WITH_HI_PREC_CLOCK
+      if (s->use_hi_prec_clock) {
+        double M = arbM * MULT32;
+        s->at.fix.ls.parts.ms = 0x80000000ul;
+        s->step.whole = (int64_t)M;
+        M -= (double)s->step.whole;
+        M *= MULT32 * MULT32;
+        s->step.fix.ls.all = (uint64_t)M;
+      } else
+#endif
+        s->step.whole = (int64_t)(arbM * MULT32 + .5);
+      s->out_in_ratio = MULT32 * arbL / (double)s->step.whole;
+    }
+    ++s;
+  }
+  /* Optional DFT post-stage (shared filter slot 1): */
+  if (have_post_stage)
+    dft_stage_init(1, tighten(Fp0 / (upsample? alpha : 1)), upsample? max(2 -
+        Fs0 / alpha, 1) : Fs0, (double)max(postL, postM), att, phase_response,
+        s++, postL, postM, &multiplier, r_spec->log2_min_dft_size,
+        r_spec->log2_large_dft_size, core_flags, core->rdft_cb);
+
+  lsx_debug("%g: >>%i %i/%i %i/%g %i/%i (%x)", 1/io_ratio,
+      shr, preL, preM, arbL, arbM, postL, postM, core_flags);
+  /* Create each stage's input fifo, pre-loaded with `preload' zeros: */
+  for (i = 0, s = p->stages; i < p->num_stages; ++i, ++s) {
+    fifo_create(&s->fifo, (int)sizeof_real);
+    memset(fifo_reserve(&s->fifo, s->preload), 0,
+        sizeof_real * (size_t)s->preload);
+    lsx_debug_more("%5i|%-5i preload=%i remL=%i",
+        s->pre, s->pre_post-s->pre, s->preload, s->at.integer);
+  }
+  fifo_create(&s->fifo, (int)sizeof_real);   /* Receives the final output. */
+  return 0;
+}
+
+static bool stage_process(stage_t * stage, bool flushing)
+{ /* Top up this stage's input (recursing upstream), then run it once. */
+  fifo_t * const in = &stage->fifo;
+  bool starved = false;
+  while (!starved) {
+    int const short_by = stage->input_size - fifo_occupancy(in);
+    if (short_by <= 0) break;
+    if (!stage->is_input)
+      starved = stage_process(stage - 1, flushing);
+    else if (flushing)          /* No more input: pad stage 0 with zeros. */
+      memset(fifo_reserve(in, short_by), 0, in->item_size * (size_t)short_by);
+    else starved = true;
+  }
+  stage->fn(stage, &stage[1].fifo);
+  return starved && fifo_occupancy(in) < stage->input_size;
+}
+
+STATIC void _soxr_process(rate_t * p, size_t olen)
+{ /* Run the chain until olen outputs (capped when flushing) are queued. */
+  stage_t * const last = p->stages + p->num_stages;
+  int wanted = (int)olen;
+  if (p->flushing)
+    wanted = min(-(int)p->samples_out, wanted);
+  while (fifo_occupancy(&last->fifo) < wanted)
+    if (last->is_input || stage_process(last - 1, p->flushing)) break;
+}
+
+STATIC real * _soxr_input(rate_t * p, real const * samples, size_t n)
+{ /* Queue n samples at the first stage; refused (0) once flushing began. */
+  if (p->flushing) return 0;
+
+  p->samples_in += (int64_t)n;
+  return fifo_write(&p->stages->fifo, (int)n, samples);
+}
+
+STATIC real const * _soxr_output(rate_t * p, real * samples, size_t * n0)
+{ /* Pop up to *n0 samples from the final fifo; *n0 becomes the count read. */
+  fifo_t * const out = &p->stages[p->num_stages].fifo;
+  int k = min((int)*n0, fifo_occupancy(out));
+  if (p->flushing) k = min(k, -(int)p->samples_out);
+  return p->samples_out += k, fifo_read(out, (int)(*n0 = (size_t)k), samples);
+}
+
+STATIC void _soxr_flush(rate_t * p)
+{ /* First call only: make samples_out minus the output still owed. */
+  if (!p->flushing) {
+    p->samples_out -= (int64_t)((double)p->samples_in / p->io_ratio + .5);
+    p->flushing = true, p->samples_in = 0;
+  }
+}
+
+STATIC void _soxr_close(rate_t * p)
+{ /* Release one channel's stages plus the filter data held in `shared'. */
+  if (p->stages) {
+    fn_t const * const RDFT_CB = p->core->rdft_cb;
+    rate_shared_t * shared = p->stages[0].shared;
+    int i;
+    /* `<=': the extra, final stage slot owns the output fifo. */
+    for (i = 0; i <= p->num_stages; ++i) {
+      stage_t * s = &p->stages[i];
+      rdft_free(s->dft_scratch);
+      rdft_free(s->dft_out);
+      fifo_delete(&s->fifo);
+    }
+    if (shared) {
+      for (i = 0; i < 2; ++i) {
+        dft_filter_t * f= &shared->dft_filter[i];
+        rdft_free(f->coefs);
+        rdft_delete_setup(f->dft_forward_setup);
+        rdft_delete_setup(f->dft_backward_setup);
+      }
+      p->core->mem.free(shared->poly_fir_coefs);
+      memset(shared, 0, sizeof(*shared)); /* Later closers then see only 0s. */
+    }
+    free(p->stages), p->stages = 0;   /* So that a repeated close is a no-op. */
+    (void)RDFT_CB;
+  }
+}
+
+#if defined SOXR_LIB
+STATIC double _soxr_delay(rate_t * p)
+{ /* Output samples expected from the input so far, less those delivered. */
+  return -(double)p->samples_out + (double)p->samples_in / p->io_ratio;
+}
+
+STATIC void _soxr_sizes(size_t * shared, size_t * channel)
+{ /* Report the sizes of the shared and the per-channel state structures. */
+  *shared  = sizeof(rate_shared_t);
+  *channel = sizeof(struct rate);
+}
+#endif
diff --git a/src/cr.h b/src/cr.h
new file mode 100644
index 0000000..d6e8637
--- /dev/null
+++ b/src/cr.h
@@ -0,0 +1,178 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_cr_included
+#define soxr_cr_included
+
+#define  FIFO_SIZE_T int
+#include "fifo.h"
+
+typedef void real; /* float or double */
+struct stage;
+typedef void (* stage_fn_t)(struct stage * input, fifo_t * output);
+typedef struct half_fir_info {
+  int num_coefs;
+  real const * coefs;
+  stage_fn_t fn, dfn;
+  float att;
+} half_fir_info_t;
+typedef struct {float scalar; stage_fn_t fn;} poly_fir1_t;
+typedef struct {float beta; poly_fir1_t interp[3];} poly_fir_t;
+
+#define U100_l 42
+#define MULT32 (65536. * 65536.)
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len][interp_order+1]: */
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+  (fir_len) * ((interp_order) + 1) * (phase_num) + \
+  ((interp_order) + 1) * (fir_coef_num) + \
+  ((interp_order) - (coef_interp_num))]
+
+/* Conceptually: coef_p is &coefs[num_phases][fir_len/4][interp_order+1][4]: */
+#define coef4(coef_p, interp_order, fir_len, phase_num, coef_interp_num, fir_coef_num) (coef_p)[\
+  (fir_len) * ((interp_order) + 1) * (phase_num) + \
+  ((interp_order) + 1) * ((fir_coef_num) & ~3) + \
+  4 * ((interp_order) - (coef_interp_num)) + \
+  ((fir_coef_num) & 3)]
+
+typedef union { /* Int64 in parts */
+  #if HAVE_BIGENDIAN
+  struct {int32_t ms; uint32_t ls;} parts;
+  #else
+  struct {uint32_t ls; int32_t ms;} parts;
+  #endif
+  int64_t all;
+} int64p_t;
+
+typedef union { /* Uint64 in parts */
+  #if HAVE_BIGENDIAN
+  struct {uint32_t ms, ls;} parts;
+  #else
+  struct {uint32_t ls, ms;} parts;
+  #endif
+  uint64_t all;
+} uint64p_t;
+
+typedef struct {
+  int        dft_length, num_taps, post_peak;
+  void       * dft_forward_setup, * dft_backward_setup;
+  real   * coefs;
+} dft_filter_t;
+
+typedef struct { /* So generated filter coefs may be shared between channels */
+  real   * poly_fir_coefs;
+  dft_filter_t dft_filter[2];
+} rate_shared_t;
+
+typedef double float_step_t; /* Or long double or __float128. */
+
+typedef union { /* Fixed point arithmetic */
+  struct {uint64p_t ls; int64p_t ms;} fix;  /* Hi-prec has ~96 bits. */
+  float_step_t flt;
+} step_t;
+
+#define integer  fix.ms.parts.ms
+#define fraction fix.ms.parts.ls
+#define whole    fix.ms.all
+
+#define CORE_DBL       1
+#define CORE_SIMD_POLY 2
+#define CORE_SIMD_HALF 4
+#define CORE_SIMD_DFT  8
+#define LOG2_SIZEOF_REAL(core_flags) (2 + ((core_flags) & 1))
+
+typedef int core_flags_t;
+
+#if defined SOXR_LIB
+#include "rdft_t.h"
+#else
+typedef void fn_t;
+#endif
+
+typedef struct stage {
+  int        num;
+
+  /* Common to all stage types: */
+  core_flags_t   core_flags;
+  stage_fn_t fn;
+  fifo_t     fifo;
+  int        pre;       /* Number of past samples to store */
+  int        pre_post;  /* pre + number of future samples to store */
+  int        preload;   /* Number of zero samples to pre-load the fifo */
+  double     out_in_ratio; /* For buffer management. */
+  int        input_size;
+  bool       is_input;
+
+  /* For a stage with variable (run-time generated) filter coefs: */
+  fn_t const * rdft_cb;
+  rate_shared_t * shared;
+  unsigned   dft_filter_num; /* Which, if any, of the 2 DFT filters to use */
+  real       * dft_scratch;
+  float      * dft_out;
+  real const * coefs;
+
+  /* For a stage with variable L/M: */
+  step_t     at, step;
+  bool       use_hi_prec_clock;
+  int        L, remM;
+  int        n, phase_bits, block_len;
+  double     mult, phase0;
+} stage_t;
+
+#define stage_occupancy(s) max(0, fifo_occupancy(&(s)->fifo) - (s)->pre_post)
+#define stage_read_p(s) ((sample_t *)fifo_read_ptr(&(s)->fifo) + (s)->pre)
+
+#define lq_bw0  (1385/2048.) /* ~.67625, FP exact. */
+
+typedef enum {rolloff_small, rolloff_medium, rolloff_none} rolloff_t;
+
+typedef struct {
+  void * (* alloc)(size_t);
+  void * (* calloc)(size_t, size_t);
+  void (* free)(void *);
+} alloc_t;
+
+typedef struct {
+  alloc_t mem;
+  half_fir_info_t  const * half_firs;
+  size_t half_firs_len;
+  half_fir_info_t  const * doub_firs;
+  size_t doub_firs_len;
+  stage_fn_t cubic_stage_fn;
+  poly_fir_t const * poly_firs;
+  fn_t * rdft_cb;
+} cr_core_t;
+
+typedef struct rate rate_t;
+struct rate {                  /* Per-channel resampler state. */
+  cr_core_t const * core;
+  double     io_ratio;         /* Input rate divided by output rate. */
+  int64_t    samples_in, samples_out; /* _out < 0 once flushing: -(owed). */
+  int        num_stages, flushing;
+  stage_t    * stages;         /* num_stages + 1 entries; last: output fifo. */
+};
+
+#if defined SOXR_LIB
+
+#include "soxr.h"
+
+char const * _soxr_init(
+  rate_t * const p,                /* Per audio channel.                            */
+  rate_shared_t * const shared,    /* Between channels (undergoing same rate change)*/
+  double const io_ratio,           /* Input rate divided by output rate.            */
+  soxr_quality_spec_t const * const q_spec,
+  soxr_runtime_spec_t const * const r_spec,
+  double multiplier,               /* Linear gain to apply during conversion.   1   */
+  cr_core_t const * const core,
+  core_flags_t const);
+
+void _soxr_process(struct rate * p, size_t olen);
+real * _soxr_input(struct rate * p, real const * samples, size_t n);
+real const * _soxr_output(struct rate * p, real * samples, size_t * n0);
+void _soxr_flush(struct rate * p);
+void _soxr_close(struct rate * p);
+double _soxr_delay(struct rate * p);
+void _soxr_sizes(size_t * shared, size_t * channel);
+#endif
+
+#endif
diff --git a/src/cr32.c b/src/cr32.c
new file mode 100644
index 0000000..b9eb264
--- /dev/null
+++ b/src/cr32.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate32_cb
+#define CORE_STR   "cr32"
+
+#define CORE_TYPE  0
+#include "cr-core.c"
diff --git a/src/cr32s.c b/src/cr32s.c
new file mode 100644
index 0000000..5de2a43
--- /dev/null
+++ b/src/cr32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate32s_cb
+#define CORE_STR   "cr32s"
+
+#define CORE_TYPE  (CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+#include "cr-core.c"
diff --git a/src/cr64.c b/src/cr64.c
new file mode 100644
index 0000000..518cdd7
--- /dev/null
+++ b/src/cr64.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate64_cb
+#define CORE_STR   "cr64"
+
+#define CORE_TYPE  CORE_DBL
+#include "cr-core.c"
diff --git a/src/cr64s.c b/src/cr64s.c
new file mode 100644
index 0000000..5dcd6f1
--- /dev/null
+++ b/src/cr64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define RATE_CB    _soxr_rate64s_cb
+#define CORE_STR   "cr64s"
+
+#define CORE_TYPE  (CORE_DBL|CORE_SIMD_POLY|CORE_SIMD_HALF|CORE_SIMD_DFT)
+#include "cr-core.c"
diff --git a/src/data-io.c b/src/data-io.c
new file mode 100644
index 0000000..fb61675
--- /dev/null
+++ b/src/data-io.c
@@ -0,0 +1,223 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <limits.h>
+#include <math.h>
+#include <string.h>
+
+#include "data-io.h"
+#include "internal.h"
+
+/* Copy n frames of ch interleaved T samples from *src0 into planar dest[0..ch-1]
+ * (cast to DEINTERLEAVE_TO) and advance *src0; flag allows memcpy when ch == 1. */
+#define DEINTERLEAVE_FROM(T,flag) do { \
+  unsigned i; \
+  size_t j; \
+  T const * src = *src0; \
+  if (ch > 1) for (j = 0; j < n; ++j) \
+    for (i = 0; i < ch; ++i) dest[i][j] = (DEINTERLEAVE_TO)*src++; \
+  else if (flag) memcpy(dest[0], src, n * sizeof(T)), src = &src[n]; \
+  else for (j = 0; j < n; dest[0][j++] = (DEINTERLEAVE_TO)*src++); \
+  *src0 = src; \
+} while (0)
+
+/* Split n interleaved frames (sample type = data_type & 3) into ch planar double
+ * buffers; *src0 is left pointing just past the samples that were read. */
+#if WITH_CR64 || WITH_CR64S
+void _soxr_deinterleave(double * * dest, /* Round/clipping not needed here */
+    soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
+{
+#define DEINTERLEAVE_TO double
+  switch (data_type & 3) {
+    case SOXR_FLOAT32: DEINTERLEAVE_FROM(float, 0); break;
+    case SOXR_FLOAT64: DEINTERLEAVE_FROM(double, 1); break;
+    case SOXR_INT32:   DEINTERLEAVE_FROM(int32_t, 0); break;
+    case SOXR_INT16:   DEINTERLEAVE_FROM(int16_t, 0); break;
+    default: break;
+  }
+}
+#endif
+
+/* Split n interleaved frames (sample type = data_type & 3) into ch planar float
+ * buffers; *src0 is left pointing just past the samples that were read. */
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+void _soxr_deinterleave_f(float * * dest, /* Round/clipping not needed here */
+    soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch)
+{
+#undef DEINTERLEAVE_TO
+#define DEINTERLEAVE_TO float
+  switch (data_type & 3) {
+    case SOXR_FLOAT32: DEINTERLEAVE_FROM(float, 1); break;
+    case SOXR_FLOAT64: DEINTERLEAVE_FROM(double, 0); break;
+    case SOXR_INT32:   DEINTERLEAVE_FROM(int32_t, 0); break;
+    case SOXR_INT16:   DEINTERLEAVE_FROM(int16_t, 0); break;
+    default: break;
+  }
+}
+#endif
+
+
+
+#include "rint.h"
+
+
+/* C99 or later with FPU_RINT32: turn FENV_ACCESS on for the code that follows. */
+#if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__
+  #if __STDC_VERSION__ >= 199901L
+    #pragma STDC FENV_ACCESS ON
+  #endif
+#endif
+/* Three inclusions of rint-clip.h with FLOATX = double (CR64 / CR64S builds). */
+#if WITH_CR64 || WITH_CR64S
+#define FLOATX double
+/* 1: RINT_T = int32_t. */
+#define LSX_RINT_CLIP_2 lsx_rint32_clip_2
+#define LSX_RINT_CLIP lsx_rint32_clip
+#define RINT_CLIP rint32_clip
+#define RINT rint32D
+#if defined FPU_RINT32
+  #define FPU_RINT
+#endif
+#define RINT_T int32_t
+#define RINT_MAX 2147483647L
+#include "rint-clip.h"
+/* 2: RINT_T = int16_t. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2
+#define LSX_RINT_CLIP lsx_rint16_clip
+#define RINT_CLIP rint16_clip
+#define RINT rint16D
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#include "rint-clip.h"
+/* 3: RINT_T = int16_t with DITHER defined. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither
+#define LSX_RINT_CLIP lsx_rint16_clip_dither
+#define RINT_CLIP rint16_clip_dither
+#define RINT rint16D
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#define DITHER
+#include "rint-clip.h"
+
+#undef FLOATX
+#endif
+
+
+/* Three inclusions of rint-clip.h with FLOATX = float (CR32 / CR32S / VR32). */
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+#define FLOATX float
+/* 1: RINT_T = int32_t. */
+#define LSX_RINT_CLIP_2 lsx_rint32_clip_2_f
+#define LSX_RINT_CLIP lsx_rint32_clip_f
+#define RINT_CLIP rint32_clip_f
+#define RINT rint32F
+#if defined FPU_RINT32
+  #define FPU_RINT
+#endif
+#define RINT_T int32_t
+#define RINT_MAX 2147483647L
+#include "rint-clip.h"
+/* 2: RINT_T = int16_t. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_f
+#define LSX_RINT_CLIP lsx_rint16_clip_f
+#define RINT_CLIP rint16_clip_f
+#define RINT rint16F
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#include "rint-clip.h"
+/* 3: RINT_T = int16_t with DITHER defined. */
+#define LSX_RINT_CLIP_2 lsx_rint16_clip_2_dither_f
+#define LSX_RINT_CLIP lsx_rint16_clip_dither_f
+#define RINT_CLIP rint16_clip_dither_f
+#define RINT rint16D /* NOTE(review): rint16D, not rint16F - confirm in rint-clip.h */
+#if defined FPU_RINT16
+  #define FPU_RINT
+#endif
+#define RINT_T int16_t
+#define RINT_MAX 32767
+#define DITHER
+#include "rint-clip.h"
+
+#undef FLOATX
+#endif
+/* Same condition as the earlier FENV_ACCESS ON pragma: switch it off again. */
+#if defined FE_INVALID && defined FPU_RINT32 && defined __STDC_VERSION__
+  #if __STDC_VERSION__ >= 199901L
+    #pragma STDC FENV_ACCESS OFF
+  #endif
+#endif
+
+/* Write n frames from planar src[0..ch-1] to *dest0 as interleaved T, advance
+ * *dest0, then `return 0' from the ENCLOSING function; flag: memcpy if ch == 1. */
+#define INTERLEAVE_TO(T,flag) do { \
+  unsigned i; \
+  size_t j; \
+  T * dest = *dest0; \
+  if (ch > 1) \
+  for (j = 0; j < n; ++j) for (i = 0; i < ch; ++i) *dest++ = (T)src[i][j]; \
+  else if (flag) memcpy(dest, src[0], n * sizeof(T)), dest = &dest[n]; \
+  else for (j = 0; j < n; *dest++ = (T)src[0][j++]); \
+  *dest0 = dest; \
+  return 0; \
+} while (0)
+
+#if WITH_CR64 || WITH_CR64S
+/* Writes n frames of the ch planar double channels in src to *dest0, interleaved
+ * and converted to the sample type selected by data_type. */
+size_t /* clips */ _soxr_interleave(soxr_datatype_t data_type, void * * dest0,
+  double const * const * src, size_t n, unsigned ch, unsigned long * seed)
+{
+  int const mono = ch == 1;
+  switch (data_type & 3) {
+    /* INTERLEAVE_TO expands to code that returns 0: no fall-through here. */
+    case SOXR_FLOAT32: INTERLEAVE_TO(float, 0);
+    case SOXR_FLOAT64: INTERLEAVE_TO(double, 1);
+    case SOXR_INT32:
+      return mono? lsx_rint32_clip(dest0, src[0], n) :
+                   lsx_rint32_clip_2(dest0, src, ch, n);
+    case SOXR_INT16:  /* A non-null seed selects the dithering variants. */
+      if (seed)
+        return mono? lsx_rint16_clip_dither(dest0, src[0], n, seed) :
+                     lsx_rint16_clip_2_dither(dest0, src, ch, n, seed);
+      return mono? lsx_rint16_clip(dest0, src[0], n) :
+                   lsx_rint16_clip_2(dest0, src, ch, n);
+    default: break;
+  }
+  return 0;
+}
+#endif
+
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+/* Writes n frames of the ch planar float channels in src to *dest0, interleaved
+ * and converted to the sample type selected by data_type. */
+size_t /* clips */ _soxr_interleave_f(soxr_datatype_t data_type, void * * dest0,
+  float const * const * src, size_t n, unsigned ch, unsigned long * seed)
+{
+  int const mono = ch == 1;
+  switch (data_type & 3) {
+    /* INTERLEAVE_TO expands to code that returns 0: no fall-through here. */
+    case SOXR_FLOAT32: INTERLEAVE_TO(float, 1);
+    case SOXR_FLOAT64: INTERLEAVE_TO(double, 0);
+    case SOXR_INT32:
+      return mono? lsx_rint32_clip_f(dest0, src[0], n) :
+                   lsx_rint32_clip_2_f(dest0, src, ch, n);
+    case SOXR_INT16:  /* A non-null seed selects the dithering variants. */
+      if (seed)
+        return mono? lsx_rint16_clip_dither_f(dest0, src[0], n, seed) :
+                     lsx_rint16_clip_2_dither_f(dest0, src, ch, n, seed);
+      return mono? lsx_rint16_clip_f(dest0, src[0], n) :
+                   lsx_rint16_clip_2_f(dest0, src, ch, n);
+    default: break;
+  }
+  return 0;
+}
+#endif
diff --git a/src/data-io.h b/src/data-io.h
new file mode 100644
index 0000000..83a0a13
--- /dev/null
+++ b/src/data-io.h
@@ -0,0 +1,39 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_data_io_included
+#define soxr_data_io_included
+
+#include "soxr.h"
+/* n interleaved frames at *src0 (type per data_type) -> ch planar double buffers. */
+void _soxr_deinterleave(
+    double * * dest,
+    soxr_datatype_t data_type,
+    void const * * src0,
+    size_t n,
+    unsigned ch);
+/* As _soxr_deinterleave, but the planar destination buffers are float. */
+void _soxr_deinterleave_f(
+    float * * dest,
+    soxr_datatype_t data_type,
+    void const * * src0,
+    size_t n,
+    unsigned ch);
+/* ch planar double buffers -> n interleaved frames at *dest (type per data_type). */
+size_t /* clips */ _soxr_interleave(
+    soxr_datatype_t data_type,
+    void * * dest,
+    double const * const * src,
+    size_t n,
+    unsigned ch,
+    unsigned long * seed);
+/* As _soxr_interleave, but the planar source buffers are float. */
+size_t /* clips */ _soxr_interleave_f(
+    soxr_datatype_t data_type,
+    void * * dest,
+    float const * const * src,
+    size_t n,
+    unsigned ch,
+    unsigned long * seed);
+
+#endif
diff --git a/src/dbesi0.c b/src/dbesi0.c
new file mode 100644
index 0000000..654216e
--- /dev/null
+++ b/src/dbesi0.c
@@ -0,0 +1,149 @@
+/*  Copyright(C) 1996 Takuya OOURA
+
+You may use, copy, modify this code for any purpose and
+without fee.
+
+Package home:  http://www.kurims.kyoto-u.ac.jp/~ooura/bessel.html
+*/
+
+#include "filter.h"
+#define dbesi0 lsx_bessel_I_0
+
+/* Bessel I_0(x) function in double precision */
+/* (modified Bessel function of the first kind, order 0; depends on |x| only). */
+#include <math.h>
+/* Piecewise polynomial evaluation in |x|; a NaN argument is returned as is. */
+double dbesi0(double x)
+{
+    int k;
+    double w, t, y;
+    static const double a[65] = {
+        8.5246820682016865877e-11, 2.5966600546497407288e-9,
+        7.9689994568640180274e-8, 1.9906710409667748239e-6,
+        4.0312469446528002532e-5, 6.4499871606224265421e-4,
+        0.0079012345761930579108, 0.071111111109207045212,
+        0.444444444444724909, 1.7777777777777532045,
+        4.0000000000000011182, 3.99999999999999998,
+        1.0000000000000000001,
+        1.1520919130377195927e-10, 2.2287613013610985225e-9,
+        8.1903951930694585113e-8, 1.9821560631611544984e-6,
+        4.0335461940910133184e-5, 6.4495330974432203401e-4,
+        0.0079013012611467520626, 0.071111038160875566622,
+        0.44444450319062699316, 1.7777777439146450067,
+        4.0000000132337935071, 3.9999999968569015366,
+        1.0000000003426703174,
+        1.5476870780515238488e-10, 1.2685004214732975355e-9,
+        9.2776861851114223267e-8, 1.9063070109379044378e-6,
+        4.0698004389917945832e-5, 6.4370447244298070713e-4,
+        0.0079044749458444976958, 0.071105052411749363882,
+        0.44445280640924755082, 1.7777694934432109713,
+        4.0000055808824003386, 3.9999977081165740932,
+        1.0000004333949319118,
+        2.0675200625006793075e-10, -6.1689554705125681442e-10,
+        1.2436765915401571654e-7, 1.5830429403520613423e-6,
+        4.2947227560776583326e-5, 6.3249861665073441312e-4,
+        0.0079454472840953930811, 0.070994327785661860575,
+        0.44467219586283000332, 1.7774588182255374745,
+        4.0003038986252717972, 3.9998233869142057195,
+        1.0000472932961288324,
+        2.7475684794982708655e-10, -3.8991472076521332023e-9,
+        1.9730170483976049388e-7, 5.9651531561967674521e-7,
+        5.1992971474748995357e-5, 5.7327338675433770752e-4,
+        0.0082293143836530412024, 0.069990934858728039037,
+        0.44726764292723985087, 1.7726685170014087784,
+        4.0062907863712704432, 3.9952750700487845355,
+        1.0016354346654179322
+    };
+    static const double b[70] = {
+        6.7852367144945531383e-8, 4.6266061382821826854e-7,
+        6.9703135812354071774e-6, 7.6637663462953234134e-5,
+        7.9113515222612691636e-4, 0.0073401204731103808981,
+        0.060677114958668837046, 0.43994941411651569622,
+        2.7420017097661750609, 14.289661921740860534,
+        59.820609640320710779, 188.78998681199150629,
+        399.8731367825601118, 427.56411572180478514,
+        1.8042097874891098754e-7, 1.2277164312044637357e-6,
+        1.8484393221474274861e-5, 2.0293995900091309208e-4,
+        0.0020918539850246207459, 0.019375315654033949297,
+        0.15985869016767185908, 1.1565260527420641724,
+        7.1896341224206072113, 37.354773811947484532,
+        155.80993164266268457, 489.5211371158540918,
+        1030.9147225169564806, 1093.5883545113746958,
+        4.8017305613187493564e-7, 3.261317843912380074e-6,
+        4.9073137508166159639e-5, 5.3806506676487583755e-4,
+        0.0055387918291051866561, 0.051223717488786549025,
+        0.42190298621367914765, 3.0463625987357355872,
+        18.895299447327733204, 97.915189029455461554,
+        407.13940115493494659, 1274.3088990480582632,
+        2670.9883037012547506, 2815.7166284662544712,
+        1.2789926338424623394e-6, 8.6718263067604918916e-6,
+        1.3041508821299929489e-4, 0.001428224737372747892,
+        0.014684070635768789378, 0.13561403190404185755,
+        1.1152592585977393953, 8.0387088559465389038,
+        49.761318895895479206, 257.2684232313529138,
+        1066.8543146269566231, 3328.3874581009636362,
+        6948.8586598121634874, 7288.4893398212481055,
+        3.409350368197032893e-6, 2.3079025203103376076e-5,
+        3.4691373283901830239e-4, 0.003794994977222908545,
+        0.038974209677945602145, 0.3594948380414878371,
+        2.9522878893539528226, 21.246564609514287056,
+        131.28727387146173141, 677.38107093296675421,
+        2802.3724744545046518, 8718.5731420798254081,
+        18141.348781638832286, 18948.925349296308859
+    };
+    static const double c[45] = {
+        2.5568678676452702768e-15, 3.0393953792305924324e-14,
+        6.3343751991094840009e-13, 1.5041298011833009649e-11,
+        4.4569436918556541414e-10, 1.746393051427167951e-8,
+        1.0059224011079852317e-6, 1.0729838945088577089e-4,
+        0.05150322693642527738,
+        5.2527963991711562216e-15, 7.202118481421005641e-15,
+        7.2561421229904797156e-13, 1.482312146673104251e-11,
+        4.4602670450376245434e-10, 1.7463600061788679671e-8,
+        1.005922609132234756e-6, 1.0729838937545111487e-4,
+        0.051503226936437300716,
+        1.3365917359358069908e-14, -1.2932643065888544835e-13,
+        1.7450199447905602915e-12, 1.0419051209056979788e-11,
+        4.58047881980598326e-10, 1.7442405450073548966e-8,
+        1.0059461453281292278e-6, 1.0729837434500161228e-4,
+        0.051503226940658446941,
+        5.3771611477352308649e-14, -1.1396193006413731702e-12,
+        1.2858641335221653409e-11, -5.9802086004570057703e-11,
+        7.3666894305929510222e-10, 1.6731837150730356448e-8,
+        1.0070831435812128922e-6, 1.0729733111203704813e-4,
+        0.051503227360726294675,
+        3.7819492084858931093e-14, -4.8600496888588034879e-13,
+        1.6898350504817224909e-12, 4.5884624327524255865e-11,
+        1.2521615963377513729e-10, 1.8959658437754727957e-8,
+        1.0020716710561353622e-6, 1.073037119856927559e-4,
+        0.05150322383300230775
+    };
+    if (x != x) return x;  /* NaN: the (int) conversions below would be undefined */
+    w = fabs(x);
+    if (w < 8.5) {
+        t = w * w * 0.0625;
+        k = 13 * ((int) t);
+        y = (((((((((((a[k] * t + a[k + 1]) * t +
+            a[k + 2]) * t + a[k + 3]) * t + a[k + 4]) * t +
+            a[k + 5]) * t + a[k + 6]) * t + a[k + 7]) * t +
+            a[k + 8]) * t + a[k + 9]) * t + a[k + 10]) * t +
+            a[k + 11]) * t + a[k + 12];
+    } else if (w < 12.5) {
+        k = (int) w;
+        t = w - k;
+        k = 14 * (k - 8);
+        y = ((((((((((((b[k] * t + b[k + 1]) * t +
+            b[k + 2]) * t + b[k + 3]) * t + b[k + 4]) * t +
+            b[k + 5]) * t + b[k + 6]) * t + b[k + 7]) * t +
+            b[k + 8]) * t + b[k + 9]) * t + b[k + 10]) * t +
+            b[k + 11]) * t + b[k + 12]) * t + b[k + 13];
+    } else {
+        t = 60 / w;
+        k = 9 * ((int) t);
+        y = ((((((((c[k] * t + c[k + 1]) * t +
+            c[k + 2]) * t + c[k + 3]) * t + c[k + 4]) * t +
+            c[k + 5]) * t + c[k + 6]) * t + c[k + 7]) * t +
+            c[k + 8]) * sqrt(t) * exp(w);
+    }
+    return y;
+}
diff --git a/src/dev32s.h b/src/dev32s.h
new file mode 100644
index 0000000..7edae86
--- /dev/null
+++ b/src/dev32s.h
@@ -0,0 +1,54 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_dev32s_included
+#define soxr_dev32s_included
+/* Four-lane float SIMD primitives (v4_t and the v* macros) on SSE or NEON. */
+#if defined __GNUC__
+  #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+  #define vAlign __attribute__((aligned (16)))
+#elif defined _MSC_VER
+  #define SIMD_INLINE(T) static __forceinline T
+  #define vAlign __declspec(align(16))
+#endif
+/* __i386__ is tested too: GCC does not predefine i386 in strict ISO modes. */
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined __i386__ || defined _M_IX86
+
+#include <xmmintrin.h>
+
+#define vZero()      _mm_setzero_ps()
+#define vSet1(a)     _mm_set_ss(a)      /* lane 0 = a, other lanes 0 */
+#define vMul(a,b)    _mm_mul_ps(a,b)
+#define vAdd(a,b)    _mm_add_ps(a,b)
+#define vMac(a,b,c)  vAdd(vMul(a,b),c)
+#define vLds(a)      _mm_set1_ps(a)     /* a copied to all four lanes */
+#define vLd(a)       _mm_load_ps(a)
+#define vLdu(a)      _mm_loadu_ps(a)
+
+typedef __m128 v4_t;
+/* Store the sum of the four lanes of b to *a. */
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+  v4_t t = vAdd(_mm_movehl_ps(b, b), b);
+  _mm_store_ss(a, vAdd(t, _mm_shuffle_ps(t,t,1)));}
+
+#elif defined __arm__
+
+#include <arm_neon.h>
+/* NOTE(review): no vSet1 is defined for NEON - confirm it is unused on ARM. */
+#define vZero()      vdupq_n_f32(0)
+#define vMul(a,b)    vmulq_f32(a,b)
+#define vAdd(a,b)    vaddq_f32(a,b)
+#define vMac(a,b,c)  vmlaq_f32(c,a,b)
+#define vLds(a)      vld1q_dup_f32(&(a))
+#define vLd(a)       vld1q_f32(a)
+#define vLdu(a)      vld1q_f32(a)
+
+typedef float32x4_t v4_t;
+/* Store the sum of the four lanes of b to *a. */
+SIMD_INLINE(void) vStorSum(float * a, v4_t b) {
+  float32x2_t t = vadd_f32(vget_high_f32(b), vget_low_f32(b));
+  *a = vget_lane_f32(vpadd_f32(t, t), 0);}
+
+#endif
+
+#endif
diff --git a/src/dev64s.h b/src/dev64s.h
new file mode 100644
index 0000000..4672210
--- /dev/null
+++ b/src/dev64s.h
@@ -0,0 +1,42 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_dev64s_included
+#define soxr_dev64s_included
+/* Four-lane double SIMD primitives (v4_t and the v* macros); AVX only. */
+#if defined __GNUC__
+  #define SIMD_INLINE(T) static __inline T __attribute__((always_inline))
+  #define vAlign __attribute__((aligned (32)))
+#elif defined _MSC_VER
+  #define SIMD_INLINE(T) static __forceinline T
+  #define vAlign __declspec(align(32))
+#else
+  #define SIMD_INLINE(T) static __inline T
+#endif
+/* __i386__ is tested too: GCC does not predefine i386 in strict ISO modes. */
+#if defined __x86_64__ || defined _M_X64 || defined i386 || defined __i386__ || defined _M_IX86
+
+#include <immintrin.h>
+
+#if defined __AVX__
+
+#define vZero()      _mm256_setzero_pd()
+#define vSet1(a)     _mm256_set_pd(0,0,0,a)
+#define vMul(a,b)    _mm256_mul_pd(a,b)
+#define vAdd(a,b)    _mm256_add_pd(a,b)
+#define vMac(a,b,c)  vAdd(vMul(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define vLds(a)      _mm256_set1_pd(a)
+#define vLd(a)       _mm256_load_pd(a)
+#define vLdu(a)      _mm256_loadu_pd(a)
+
+typedef __m256d v4_t;
+/* Store the sum of the four lanes of b to *a. */
+SIMD_INLINE(void) vStorSum(double * a, v4_t b) {
+  b = _mm256_hadd_pd(b, _mm256_permute2f128_pd(b,b,1));
+  _mm_store_sd(a, _mm256_castpd256_pd128(_mm256_hadd_pd(b,b)));}
+
+#endif
+
+#endif
+
+#endif
diff --git a/src/fft4g.c b/src/fft4g.c
new file mode 100644
index 0000000..cf6293a
--- /dev/null
+++ b/src/fft4g.c
@@ -0,0 +1,1346 @@
+/* Copyright Takuya OOURA, 1996-2001.
+
+You may use, copy, modify and distribute this code for any
+purpose (include commercial use) and without fee.  Please
+refer to this package when you modify this code.
+
+Package home:  http://www.kurims.kyoto-u.ac.jp/~ooura/fft.html
+
+Fast Fourier/Cosine/Sine Transform
+    dimension   :one
+    data length :power of 2
+    decimation  :frequency
+    radix       :4, 2
+    data        :inplace
+    table       :use
+functions
+    cdft: Complex Discrete Fourier Transform
+    rdft: Real Discrete Fourier Transform
+    ddct: Discrete Cosine Transform
+    ddst: Discrete Sine Transform
+    dfct: Cosine Transform of RDFT (Real Symmetric DFT)
+    dfst: Sine Transform of RDFT (Real Anti-symmetric DFT)
+function prototypes
+    void cdft(int, int, double *, int *, double *);
+    void rdft(int, int, double *, int *, double *);
+    void ddct(int, int, double *, int *, double *);
+    void ddst(int, int, double *, int *, double *);
+    void dfct(int, double *, double *, int *, double *);
+    void dfst(int, double *, double *, int *, double *);
+
+
+-------- Complex DFT (Discrete Fourier Transform) --------
+    [definition]
+        <case1>
+            X[k] = sum_j=0^n-1 x[j]*exp(2*pi*i*j*k/n), 0<=k<n
+        <case2>
+            X[k] = sum_j=0^n-1 x[j]*exp(-2*pi*i*j*k/n), 0<=k<n
+        (notes: sum_j=0^n-1 is a summation from j=0 to n-1)
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            cdft(2*n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            cdft(2*n, -1, a, ip, w);
+    [parameters]
+        2*n            :data length (int)
+                        n >= 1, n = power of 2
+        a[0...2*n-1]   :input/output data (double *)
+                        input data
+                            a[2*j] = Re(x[j]),
+                            a[2*j+1] = Im(x[j]), 0<=j<n
+                        output data
+                            a[2*k] = Re(X[k]),
+                            a[2*k+1] = Im(X[k]), 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            cdft(2*n, -1, a, ip, w);
+        is
+            cdft(2*n, 1, a, ip, w);
+            for (j = 0; j <= 2 * n - 1; j++) {
+                a[j] *= 1.0 / n;
+            }
+        .
+
+
+-------- Real DFT / Inverse of Real DFT --------
+    [definition]
+        <case1> RDFT
+            R[k] = sum_j=0^n-1 a[j]*cos(2*pi*j*k/n), 0<=k<=n/2
+            I[k] = sum_j=0^n-1 a[j]*sin(2*pi*j*k/n), 0<k<n/2
+        <case2> IRDFT (excluding scale)
+            a[k] = (R[0] + R[n/2]*cos(pi*k))/2 +
+                   sum_j=1^n/2-1 R[j]*cos(2*pi*j*k/n) +
+                   sum_j=1^n/2-1 I[j]*sin(2*pi*j*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            rdft(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            rdft(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            output data
+                                a[2*k] = R[k], 0<=k<n/2
+                                a[2*k+1] = I[k], 0<k<n/2
+                                a[1] = R[n/2]
+                        <case2>
+                            input data
+                                a[2*j] = R[j], 0<=j<n/2
+                                a[2*j+1] = I[j], 0<j<n/2
+                                a[1] = R[n/2]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n/2-1]   :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            rdft(n, 1, a, ip, w);
+        is
+            rdft(n, -1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DCT (Discrete Cosine Transform) / Inverse of DCT --------
+    [definition]
+        <case1> IDCT (excluding scale)
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DCT
+            C[k] = sum_j=0^n-1 a[j]*cos(pi*(j+1/2)*k/n), 0<=k<n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddct(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddct(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<n
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddct(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddct(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- DST (Discrete Sine Transform) / Inverse of DST --------
+    [definition]
+        <case1> IDST (excluding scale)
+            S[k] = sum_j=1^n A[j]*sin(pi*j*(k+1/2)/n), 0<=k<n
+        <case2> DST
+            S[k] = sum_j=0^n-1 a[j]*sin(pi*(j+1/2)*k/n), 0<k<=n
+    [usage]
+        <case1>
+            ip[0] = 0; // first time only
+            ddst(n, 1, a, ip, w);
+        <case2>
+            ip[0] = 0; // first time only
+            ddst(n, -1, a, ip, w);
+    [parameters]
+        n              :data length (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        <case1>
+                            input data
+                                a[j] = A[j], 0<j<n
+                                a[0] = A[n]
+                            output data
+                                a[k] = S[k], 0<=k<n
+                        <case2>
+                            output data
+                                a[k] = S[k], 0<k<n
+                                a[0] = S[n]
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/2)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/2+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/4-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            ddst(n, -1, a, ip, w);
+        is
+            a[0] *= 0.5;
+            ddst(n, 1, a, ip, w);
+            for (j = 0; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Cosine Transform of RDFT (Real Symmetric DFT) --------
+    [definition]
+        C[k] = sum_j=0^n a[j]*cos(pi*j*k/n), 0<=k<=n
+    [usage]
+        ip[0] = 0; // first time only
+        dfct(n, a, t, ip, w);
+    [parameters]
+        n              :data length - 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n]       :input/output data (double *)
+                        output data
+                            a[k] = C[k], 0<=k<=n
+        t[0...n/2]     :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+        is
+            a[0] *= 0.5;
+            a[n] *= 0.5;
+            dfct(n, a, t, ip, w);
+            for (j = 0; j <= n; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+-------- Sine Transform of RDFT (Real Anti-symmetric DFT) --------
+    [definition]
+        S[k] = sum_j=1^n-1 a[j]*sin(pi*j*k/n), 0<k<n
+    [usage]
+        ip[0] = 0; // first time only
+        dfst(n, a, t, ip, w);
+    [parameters]
+        n              :data length + 1 (int)
+                        n >= 2, n = power of 2
+        a[0...n-1]     :input/output data (double *)
+                        output data
+                            a[k] = S[k], 0<k<n
+                        (a[0] is used for work area)
+        t[0...n/2-1]   :work area (double *)
+        ip[0...*]      :work area for bit reversal (int *)
+                        length of ip >= 2+sqrt(n/4)
+                        strictly,
+                        length of ip >=
+                            2+(1<<(int)(log(n/4+0.5)/log(2))/2).
+                        ip[0],ip[1] are pointers of the cos/sin table.
+        w[0...n*5/8-1] :cos/sin table (double *)
+                        w[],ip[] are initialized if ip[0] == 0.
+    [remark]
+        Inverse of
+            dfst(n, a, t, ip, w);
+        is
+            dfst(n, a, t, ip, w);
+            for (j = 1; j <= n - 1; j++) {
+                a[j] *= 2.0 / n;
+            }
+        .
+
+
+Appendix :
+    The cos/sin table is recalculated when a larger table is required.
+    w[] and ip[] are compatible with all routines.
+*/
+
+
+#include "math-wrap.h"
+#include "fft4g.h"
+
+#ifdef FFT4G_FLOAT /* single-precision build: entry points become lsx_*_f */
+  #define double float /* every `double' after this line then means float */
+  #define one_half 0.5f
+
+  #define sin(x)   sinf(x)
+  #define cos(x)   cosf(x)
+  #define atan(x)  atanf(x)
+
+  #define cdft  lsx_cdft_f
+  #define rdft  lsx_rdft_f
+  #define ddct  lsx_ddct_f
+  #define ddst  lsx_ddst_f
+  #define dfct  lsx_dfct_f
+  #define dfst  lsx_dfst_f
+#else /* double-precision build: entry points become lsx_* */
+  #define one_half 0.5
+  #define cdft  lsx_cdft
+  #define rdft  lsx_rdft
+  #define ddct  lsx_ddct
+  #define ddst  lsx_ddst
+  #define dfct  lsx_dfct
+  #define dfst  lsx_dfst
+#endif
+/* File-local helpers used by the public transforms below. */
+static void bitrv2conj(int n, int *ip, double *a);
+static void bitrv2(int n, int *ip, double *a);
+static void cft1st(int n, double *a, double const *w);
+static void cftbsub(int n, double *a, double const *w);
+static void cftfsub(int n, double *a, double const *w);
+static void cftmdl(int n, int l, double *a, double const *w);
+static void dctsub(int n, double *a, int nc, double const *c);
+static void dstsub(int n, double *a, int nc, double const *c);
+static void makect(int nc, int *ip, double *c);
+static void makewt(int nw, int *ip, double *w);
+static void rftbsub(int n, double *a, int nc, double const *c);
+static void rftfsub(int n, double *a, int nc, double const *c);
+/* In-place complex DFT of a[0..n-1] (n/2 interleaved re,im pairs); isgn >= 0 and
+ * isgn < 0 select the two exponent signs. Rebuilds w[]/ip[] when n > 4*ip[0]. */
+void cdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    if (n > (ip[0] << 2)) {
+        makewt(n >> 2, ip, w);
+    }
+    if (n > 4) {
+        if (isgn >= 0) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+        } else {
+            bitrv2conj(n, ip + 2, a);
+            cftbsub(n, a, w);
+        }
+    } else if (n == 4) {
+        cftfsub(n, a, w);
+    }
+}
+/* In-place real DFT (isgn >= 0) or its unscaled inverse (isgn < 0) of a[0..n-1];
+ * a[1] holds R[n/2]. The tables in w[] (via ip[0], ip[1]) are grown on demand. */
+void rdft(int n, int isgn, double *a, int *ip, double *w)
+{
+    int nw, nc;
+    double xi;
+
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > (nc << 2)) {
+        nc = n >> 2;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xi = a[0] - a[1];
+        a[0] += a[1];
+        a[1] = xi;
+    } else {
+        a[1] = one_half * (a[0] - a[1]);
+        a[0] -= a[1];
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+}
+/* In-place cosine transform of a[0..n-1]: isgn < 0 is the DCT, isgn >= 0 the
+ * unscaled IDCT (see the file header). Tables in w[]/ip[] are grown on demand. */
+void ddct(int n, int isgn, double *a, int *ip, double *w)
+{
+    int j, nw, nc;
+    double xr;
+
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = a[j] - a[j - 1];
+            a[j] += a[j - 1];
+        }
+        a[1] = a[0] - xr;
+        a[0] += xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dctsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = a[j] - a[j + 1];
+            a[j] += a[j + 1];
+        }
+        a[n - 1] = xr;
+    }
+}
+/* In-place sine transform of a[0..n-1]: isgn < 0 is the DST, isgn >= 0 the
+ * unscaled IDST (see the file header). Tables in w[]/ip[] are grown on demand. */
+void ddst(int n, int isgn, double *a, int *ip, double *w)
+{
+    int j, nw, nc;
+    double xr;
+
+    nw = ip[0];
+    if (n > (nw << 2)) {
+        nw = n >> 2;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];
+    if (n > nc) {
+        nc = n;
+        makect(nc, ip, w + nw);
+    }
+    if (isgn < 0) {
+        xr = a[n - 1];
+        for (j = n - 2; j >= 2; j -= 2) {
+            a[j + 1] = -a[j] - a[j - 1];
+            a[j] -= a[j - 1];
+        }
+        a[1] = a[0] + xr;
+        a[0] -= xr;
+        if (n > 4) {
+            rftbsub(n, a, nc, w + nw);
+            bitrv2(n, ip + 2, a);
+            cftbsub(n, a, w);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+    }
+    dstsub(n, a, nc, w + nw);
+    if (isgn >= 0) {
+        if (n > 4) {
+            bitrv2(n, ip + 2, a);
+            cftfsub(n, a, w);
+            rftfsub(n, a, nc, w + nw);
+        } else if (n == 4) {
+            cftfsub(n, a, w);
+        }
+        xr = a[0] - a[1];
+        a[0] += a[1];
+        for (j = 2; j < n; j += 2) {
+            a[j - 1] = -a[j] - a[j + 1];
+            a[j] -= a[j + 1];
+        }
+        a[n - 1] = -xr;
+    }
+}
+
+
+void dfct(int n, double *a, double *t, int *ip, double *w)  /* in-place on a[0..n] (n + 1 values); t is work space */
+{
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    /* Rebuild the tables when too short for n: makewt() for n / 8 entries in w, makect() for n / 2 entries at w + nw. */
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];  /* read after makewt(), which resets ip[1] to 1 */
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    m = n >> 1;
+    yi = a[m];
+    xi = a[0] + a[n];
+    a[0] -= a[n];
+    t[0] = xi - yi;
+    t[m] = xi + yi;
+    if (n > 2) {
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {  /* differences of mirrored inputs stay in a[]; combinations of their sums go to t[] */
+            k = m - j;
+            xr = a[j] - a[n - j];
+            xi = a[j] + a[n - j];
+            yr = a[k] - a[n - k];
+            yi = a[k] + a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi - yi;
+            t[k] = xi + yi;
+        }
+        t[mh] = a[mh] + a[n - mh];
+        a[mh] -= a[n - mh];
+        dctsub(m, a, nc, w + nw);  /* first pass works on the m values left in a[] */
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[0] - a[1];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {  /* spread the results onto the odd indices of a[], highest first */
+            a[2 * j + 1] = a[j] + a[j + 1];
+            a[2 * j - 1] = a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {  /* remaining passes: transform m values of t[], store them at a[l], a[n - l], a[k -/+ l] */
+            dctsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[0] - t[1];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = t[j] - t[j + 1];
+                a[k + l] = t[j] + t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 0; j < mh; j++) {  /* fold t[m .. 2m] down into t[0 .. m] for the next, half-sized pass */
+                k = m - j;
+                t[j] = t[m + k] - t[m + j];
+                t[k] = t[m + k] + t[m + j];
+            }
+            t[mh] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+        a[n] = t[2] - t[1];
+        a[0] = t[2] + t[1];
+    } else {  /* n <= 2: the outputs come straight from the sums formed above */
+        a[1] = a[0];
+        a[2] = t[0];
+        a[0] = t[1];
+    }
+}
+
+
+void dfst(int n, double *a, double *t, int *ip, double *w)  /* in-place on a[1..n-1]; a[0] is not an input and is 0 on return; t is work space */
+{
+    int j, k, l, m, mh, nw, nc;
+    double xr, xi, yr, yi;
+    /* Rebuild the tables when too short for n: makewt() for n / 8 entries in w, makect() for n / 2 entries at w + nw. */
+    nw = ip[0];
+    if (n > (nw << 3)) {
+        nw = n >> 3;
+        makewt(nw, ip, w);
+    }
+    nc = ip[1];  /* read after makewt(), which resets ip[1] to 1 */
+    if (n > (nc << 1)) {
+        nc = n >> 1;
+        makect(nc, ip, w + nw);
+    }
+    if (n > 2) {
+        m = n >> 1;
+        mh = m >> 1;
+        for (j = 1; j < mh; j++) {  /* sums of mirrored inputs stay in a[]; combinations of their differences go to t[] */
+            k = m - j;
+            xr = a[j] + a[n - j];
+            xi = a[j] - a[n - j];
+            yr = a[k] + a[n - k];
+            yi = a[k] - a[n - k];
+            a[j] = xr;
+            a[k] = yr;
+            t[j] = xi + yi;
+            t[k] = xi - yi;
+        }
+        t[0] = a[mh] - a[n - mh];
+        a[mh] += a[n - mh];
+        a[0] = a[m];
+        dstsub(m, a, nc, w + nw);  /* first pass works on the m values left in a[] */
+        if (m > 4) {
+            bitrv2(m, ip + 2, a);
+            cftfsub(m, a, w);
+            rftfsub(m, a, nc, w + nw);
+        } else if (m == 4) {
+            cftfsub(m, a, w);
+        }
+        a[n - 1] = a[1] - a[0];
+        a[1] = a[0] + a[1];
+        for (j = m - 2; j >= 2; j -= 2) {  /* spread the results onto the odd indices of a[], highest first */
+            a[2 * j + 1] = a[j] - a[j + 1];
+            a[2 * j - 1] = -a[j] - a[j + 1];
+        }
+        l = 2;
+        m = mh;
+        while (m >= 2) {  /* remaining passes: transform m values of t[], store them at a[l], a[n - l], a[k -/+ l] */
+            dstsub(m, t, nc, w + nw);
+            if (m > 4) {
+                bitrv2(m, ip + 2, t);
+                cftfsub(m, t, w);
+                rftfsub(m, t, nc, w + nw);
+            } else if (m == 4) {
+                cftfsub(m, t, w);
+            }
+            a[n - l] = t[1] - t[0];
+            a[l] = t[0] + t[1];
+            k = 0;
+            for (j = 2; j < m; j += 2) {
+                k += l << 2;
+                a[k - l] = -t[j] - t[j + 1];
+                a[k + l] = t[j] - t[j + 1];
+            }
+            l <<= 1;
+            mh = m >> 1;
+            for (j = 1; j < mh; j++) {  /* fold the upper part of t[] down for the next, half-sized pass */
+                k = m - j;
+                t[j] = t[m + k] + t[m + j];
+                t[k] = t[m + k] - t[m + j];
+            }
+            t[0] = t[m + mh];
+            m = mh;
+        }
+        a[l] = t[0];
+    }
+    a[0] = 0;  /* unconditional, including the n <= 2 case where nothing else is done */
+}
+
+
+/* -------- initializing routines -------- */
+
+
+static void makewt(int nw, int *ip, double *w)
+{
+    int i, half;
+    double step, c, s;
+    /* Builds the cos/sin table in w[] and records its length nw in ip[0]. */
+    ip[0] = nw;
+    ip[1] = 1;  /* callers such as ddct() then see a too-short makect() table and rebuild it */
+    if (nw <= 2) { return; }  /* nothing to tabulate */
+    half = nw >> 1;
+    step = atan(1.0) / (double)half;
+    w[0] = 1;
+    w[1] = 0;
+    w[half] = cos(step * (double)half);
+    w[half + 1] = w[half];
+    if (half <= 2) { return; }  /* the four entries above are the whole table */
+    for (i = 2; i < half; i += 2) {
+        c = cos(step * (double)i);
+        s = sin(step * (double)i);
+        w[i] = c;
+        w[i + 1] = s;
+        w[nw - i] = s;
+        w[nw - i + 1] = c;
+    }
+    /* Reorder the table with the same pair permutation that bitrv2() applies to data. */
+    bitrv2(nw, ip + 2, w);
+    /* (bitrv2() ignores its second argument; ip + 2 is passed only to satisfy the signature.) */
+}
+
+
+static void makect(int nc, int *ip, double *c)
+{
+    int i, half;
+    double step;
+    /* Builds the cos/sin table in c[] and records its length nc in ip[1]. */
+    ip[1] = nc;
+    if (nc <= 1) { return; }  /* no entries to compute */
+    half = nc >> 1;
+    step = atan(1.0) / (double)half;
+    c[0] = cos(step * (double)half);  /* the only entry stored without the one_half factor */
+    c[half] = one_half * c[0];
+    for (i = 1; i < half; i++) {
+        /* cos values fill c[1..] upward, sin values fill c[nc - 1..] downward */
+        c[i] = one_half * cos(step * (double)i);
+        c[nc - i] = one_half * sin(step * (double)i);
+    }
+}
+
+
+/* -------- child routines -------- */
+
+
+static void bitrv2(int n, int *ip0, double *a)  /* permutes the (even, odd) pairs of a[0..n-1] in place */
+{
+    int j, j1, k, k1, l, m, m2, ip[1024];
+    double xr, xi, yr, yi;
+    /* The index table is rebuilt here on every call; it holds m entries, so ip[1024] limits power-of-two n to 2^23. */
+    (void)ip0;  /* the caller's work area is not used */
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {  /* each round doubles the table and halves l */
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {  /* this case swaps four pairs per (j, k) plus one pair per k */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            j1 = 2 * k + m2 + ip[k];  /* the one extra swap for the j == k position */
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = a[j1 + 1];
+            yr = a[k1];
+            yi = a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+        }
+    } else {  /* this case swaps two pairs per (j, k) and nothing on the diagonal */
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = a[j1 + 1];
+                yr = a[k1];
+                yi = a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+        }
+    }
+}
+
+
+static void bitrv2conj(int n, int *ip0, double *a)  /* bitrv2()'s pair permutation, also negating every odd-indexed element */
+{
+    int j, j1, k, k1, l, m, m2, ip[1024];
+    double xr, xi, yr, yi;
+    /* Local index table, sized as in bitrv2(): the build loop fills m entries, and m reaches 1024 once n exceeds 2^21. */
+    (void)ip0;  /* the caller's work area is not used */
+    ip[0] = 0;
+    l = n;
+    m = 1;
+    while ((m << 3) < l) {  /* each round doubles the table and halves l */
+        l >>= 1;
+        for (j = 0; j < m; j++) {
+            ip[m + j] = ip[j] + l;
+        }
+        m <<= 1;
+    }
+    m2 = 2 * m;
+    if ((m << 3) == l) {  /* this case swaps four pairs per (j, k) and touches three positions per k */
+        for (k = 0; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 -= m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += 2 * m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];  /* pairs that stay in place still get their odd slot negated */
+            a[k1 + 1] = -a[k1 + 1];
+            j1 = k1 + m2;
+            k1 = j1 + m2;
+            xr = a[j1];
+            xi = -a[j1 + 1];
+            yr = a[k1];
+            yi = -a[k1 + 1];
+            a[j1] = yr;
+            a[j1 + 1] = yi;
+            a[k1] = xr;
+            a[k1 + 1] = xi;
+            k1 += m2;
+            a[k1 + 1] = -a[k1 + 1];
+        }
+    } else {  /* this case swaps two pairs per (j, k); in-place pairs are only negated */
+        a[1] = -a[1];
+        a[m2 + 1] = -a[m2 + 1];
+        for (k = 1; k < m; k++) {
+            for (j = 0; j < k; j++) {
+                j1 = 2 * j + ip[k];
+                k1 = 2 * k + ip[j];
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+                j1 += m2;
+                k1 += m2;
+                xr = a[j1];
+                xi = -a[j1 + 1];
+                yr = a[k1];
+                yi = -a[k1 + 1];
+                a[j1] = yr;
+                a[j1 + 1] = yi;
+                a[k1] = xr;
+                a[k1 + 1] = xi;
+            }
+            k1 = 2 * k + ip[k];
+            a[k1 + 1] = -a[k1 + 1];
+            a[k1 + m2 + 1] = -a[k1 + m2 + 1];
+        }
+    }
+}
+
+
+static void cftfsub(int n, double *a, double const *w)  /* runs cft1st(), then cftmdl() passes, then one final combine over a[0..n-1] */
+{
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    /* l is the current block length in doubles; it grows by a factor of 4 with each cftmdl() pass. */
+    l = 2;
+    if (n > 8) {
+        cft1st(n, a, w);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {  /* four blocks of length l remain: combine j, j + l, j + 2l, j + 3l */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i - x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i + x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i - x3r;
+        }
+    } else {  /* otherwise only two blocks are combined: j and j + l */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = a[j + 1] - a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] += a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+static void cftbsub(int n, double *a, double const *w)  /* same pass sequence as cftfsub(); only the final combine differs in sign */
+{
+    int j, j1, j2, j3, l;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    /* l is the current block length in doubles; it grows by a factor of 4 with each cftmdl() pass. */
+    l = 2;
+    if (n > 8) {
+        cft1st(n, a, w);
+        l = 8;
+        while ((l << 2) < n) {
+            cftmdl(n, l, a, w);
+            l <<= 2;
+        }
+    }
+    if ((l << 2) == n) {  /* four blocks remain; x0i and x1i are formed from the negated a[j + 1] */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = -a[j + 1] - a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = -a[j + 1] + a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i - x2i;
+            a[j2] = x0r - x2r;
+            a[j2 + 1] = x0i + x2i;
+            a[j1] = x1r - x3i;
+            a[j1 + 1] = x1i - x3r;
+            a[j3] = x1r + x3i;
+            a[j3 + 1] = x1i + x3r;
+        }
+    } else {  /* two blocks remain; both odd-indexed results are negated relative to cftfsub() */
+        for (j = 0; j < l; j += 2) {
+            j1 = j + l;
+            x0r = a[j] - a[j1];
+            x0i = -a[j + 1] + a[j1 + 1];
+            a[j] += a[j1];
+            a[j + 1] = -a[j + 1] - a[j1 + 1];
+            a[j1] = x0r;
+            a[j1 + 1] = x0i;
+        }
+    }
+}
+
+
+static void cft1st(int n, double *a, double const *w)  /* first pass: works on a[] in groups of 16 doubles; a[0..15] is always accessed */
+{
+    int j, k1, k2;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    /* a[0..7] uses no table entries and a[8..15] only w[2]; each later group reads w[k1], w[k1 + 1], w[k2 .. k2 + 3]. */
+    x0r = a[0] + a[2];
+    x0i = a[1] + a[3];
+    x1r = a[0] - a[2];
+    x1i = a[1] - a[3];
+    x2r = a[4] + a[6];
+    x2i = a[5] + a[7];
+    x3r = a[4] - a[6];
+    x3i = a[5] - a[7];
+    a[0] = x0r + x2r;
+    a[1] = x0i + x2i;
+    a[4] = x0r - x2r;
+    a[5] = x0i - x2i;
+    a[2] = x1r - x3i;
+    a[3] = x1i + x3r;
+    a[6] = x1r + x3i;
+    a[7] = x1i - x3r;
+    wk1r = w[2];  /* the single table value needed for a[8..15] */
+    x0r = a[8] + a[10];
+    x0i = a[9] + a[11];
+    x1r = a[8] - a[10];
+    x1i = a[9] - a[11];
+    x2r = a[12] + a[14];
+    x2i = a[13] + a[15];
+    x3r = a[12] - a[14];
+    x3i = a[13] - a[15];
+    a[8] = x0r + x2r;
+    a[9] = x0i + x2i;
+    a[12] = x2i - x0i;
+    a[13] = x0r - x2r;
+    x0r = x1r - x3i;
+    x0i = x1i + x3r;
+    a[10] = wk1r * (x0r - x0i);
+    a[11] = wk1r * (x0r + x0i);
+    x0r = x3i + x1r;
+    x0i = x3r - x1i;
+    a[14] = wk1r * (x0i - x0r);
+    a[15] = wk1r * (x0i + x0r);
+    k1 = 0;
+    for (j = 16; j < n; j += 16) {  /* general groups: k1 advances by 2 per group, k2 = 2 * k1 */
+        k1 += 2;
+        k2 = 2 * k1;
+        wk2r = w[k1];
+        wk2i = w[k1 + 1];
+        wk1r = w[k2];
+        wk1i = w[k2 + 1];
+        wk3r = wk1r - 2 * wk2i * wk1i;  /* third factor is derived from the two loaded ones, not read from w */
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        x0r = a[j] + a[j + 2];
+        x0i = a[j + 1] + a[j + 3];
+        x1r = a[j] - a[j + 2];
+        x1i = a[j + 1] - a[j + 3];
+        x2r = a[j + 4] + a[j + 6];
+        x2i = a[j + 5] + a[j + 7];
+        x3r = a[j + 4] - a[j + 6];
+        x3i = a[j + 5] - a[j + 7];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 4] = wk2r * x0r - wk2i * x0i;
+        a[j + 5] = wk2r * x0i + wk2i * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 2] = wk1r * x0r - wk1i * x0i;
+        a[j + 3] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 6] = wk3r * x0r - wk3i * x0i;
+        a[j + 7] = wk3r * x0i + wk3i * x0r;
+        wk1r = w[k2 + 2];  /* second half of the group (a[j + 8 .. j + 15]) uses the next table pair */
+        wk1i = w[k2 + 3];
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        x0r = a[j + 8] + a[j + 10];
+        x0i = a[j + 9] + a[j + 11];
+        x1r = a[j + 8] - a[j + 10];
+        x1i = a[j + 9] - a[j + 11];
+        x2r = a[j + 12] + a[j + 14];
+        x2i = a[j + 13] + a[j + 15];
+        x3r = a[j + 12] - a[j + 14];
+        x3i = a[j + 13] - a[j + 15];
+        a[j + 8] = x0r + x2r;
+        a[j + 9] = x0i + x2i;
+        x0r -= x2r;
+        x0i -= x2i;
+        a[j + 12] = -wk2i * x0r - wk2r * x0i;
+        a[j + 13] = -wk2i * x0i + wk2r * x0r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j + 10] = wk1r * x0r - wk1i * x0i;
+        a[j + 11] = wk1r * x0i + wk1i * x0r;
+        x0r = x1r + x3i;
+        x0i = x1i - x3r;
+        a[j + 14] = wk3r * x0r - wk3i * x0i;
+        a[j + 15] = wk3r * x0i + wk3i * x0r;
+    }
+}
+
+
+static void cftmdl(int n, int l, double *a, double const *w)  /* one middle pass combining four blocks of length l (in doubles) at a time */
+{
+    int j, j1, j2, j3, k, k1, k2, m, m2;
+    double wk1r, wk1i, wk2r, wk2i, wk3r, wk3i;
+    double x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
+    /* m = 4 * l. a[0..m-1] uses no table entries, a[m..2m-1] only w[2]; every later 2 * m doubles read w[k1], w[k1 + 1], w[k2 .. k2 + 3]. */
+    m = l << 2;
+    for (j = 0; j < l; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x0r - x2r;
+        a[j2 + 1] = x0i - x2i;
+        a[j1] = x1r - x3i;
+        a[j1 + 1] = x1i + x3r;
+        a[j3] = x1r + x3i;
+        a[j3 + 1] = x1i - x3r;
+    }
+    wk1r = w[2];  /* the single table value needed for a[m .. 2m - 1] */
+    for (j = m; j < l + m; j += 2) {
+        j1 = j + l;
+        j2 = j1 + l;
+        j3 = j2 + l;
+        x0r = a[j] + a[j1];
+        x0i = a[j + 1] + a[j1 + 1];
+        x1r = a[j] - a[j1];
+        x1i = a[j + 1] - a[j1 + 1];
+        x2r = a[j2] + a[j3];
+        x2i = a[j2 + 1] + a[j3 + 1];
+        x3r = a[j2] - a[j3];
+        x3i = a[j2 + 1] - a[j3 + 1];
+        a[j] = x0r + x2r;
+        a[j + 1] = x0i + x2i;
+        a[j2] = x2i - x0i;
+        a[j2 + 1] = x0r - x2r;
+        x0r = x1r - x3i;
+        x0i = x1i + x3r;
+        a[j1] = wk1r * (x0r - x0i);
+        a[j1 + 1] = wk1r * (x0r + x0i);
+        x0r = x3i + x1r;
+        x0i = x3r - x1i;
+        a[j3] = wk1r * (x0i - x0r);
+        a[j3 + 1] = wk1r * (x0i + x0r);
+    }
+    k1 = 0;
+    m2 = 2 * m;
+    for (k = m2; k < n; k += m2) {  /* general sections: k1 advances by 2 per section, k2 = 2 * k1 */
+        k1 += 2;
+        k2 = 2 * k1;
+        wk2r = w[k1];
+        wk2i = w[k1 + 1];
+        wk1r = w[k2];
+        wk1i = w[k2 + 1];
+        wk3r = wk1r - 2 * wk2i * wk1i;  /* third factor is derived from the two loaded ones, not read from w */
+        wk3i = 2 * wk2i * wk1r - wk1i;
+        for (j = k; j < l + k; j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = wk2r * x0r - wk2i * x0i;
+            a[j2 + 1] = wk2r * x0i + wk2i * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+        wk1r = w[k2 + 2];  /* second half of the section (from k + m) uses the next table pair */
+        wk1i = w[k2 + 3];
+        wk3r = wk1r - 2 * wk2r * wk1i;
+        wk3i = 2 * wk2r * wk1r - wk1i;
+        for (j = k + m; j < l + (k + m); j += 2) {
+            j1 = j + l;
+            j2 = j1 + l;
+            j3 = j2 + l;
+            x0r = a[j] + a[j1];
+            x0i = a[j + 1] + a[j1 + 1];
+            x1r = a[j] - a[j1];
+            x1i = a[j + 1] - a[j1 + 1];
+            x2r = a[j2] + a[j3];
+            x2i = a[j2 + 1] + a[j3 + 1];
+            x3r = a[j2] - a[j3];
+            x3i = a[j2 + 1] - a[j3 + 1];
+            a[j] = x0r + x2r;
+            a[j + 1] = x0i + x2i;
+            x0r -= x2r;
+            x0i -= x2i;
+            a[j2] = -wk2i * x0r - wk2r * x0i;
+            a[j2 + 1] = -wk2i * x0i + wk2r * x0r;
+            x0r = x1r - x3i;
+            x0i = x1i + x3r;
+            a[j1] = wk1r * x0r - wk1i * x0i;
+            a[j1 + 1] = wk1r * x0i + wk1i * x0r;
+            x0r = x1r + x3i;
+            x0i = x1i - x3r;
+            a[j3] = wk3r * x0r - wk3i * x0i;
+            a[j3 + 1] = wk3r * x0i + wk3i * x0r;
+        }
+    }
+}
+
+
+static void rftfsub(int n, double *a, int nc, double const *c)
+{
+    int lo, hi, idx, stride, half;
+    double wr, wi, dr, di, tr, ti;
+    /* Adjusts the mirrored pairs (lo, n - lo) of a[] in place, weighted by entries of the table c[]. */
+    half = n >> 1;
+    stride = 2 * nc / half;
+    /* idx advances through c[] by stride per pair; c[idx] and c[nc - idx] form the weight. */
+    for (lo = 2, idx = stride; lo < half; lo += 2, idx += stride) {
+        hi = n - lo;  /* mirror partner of lo */
+        wr = one_half - c[nc - idx];
+        wi = c[idx];
+        dr = a[lo] - a[hi];
+        di = a[lo + 1] + a[hi + 1];
+        tr = wr * dr - wi * di;
+        ti = wr * di + wi * dr;
+        /* even slots move in opposite directions, odd slots in the same direction */
+        a[lo] -= tr;
+        a[lo + 1] -= ti;
+        a[hi] += tr;
+        a[hi + 1] -= ti;
+    }
+}
+
+
+static void rftbsub(int n, double *a, int nc, double const *c)
+{
+    int lo, hi, idx, stride, half;
+    double wr, wi, dr, di, tr, ti;
+    /* Same pair walk as rftfsub(), with the opposite sign on wi and every odd-indexed slot sign-flipped. */
+    a[1] = -a[1];
+    half = n >> 1;
+    stride = 2 * nc / half;
+    /* idx advances through c[] by stride per pair; c[idx] and c[nc - idx] form the weight. */
+    for (lo = 2, idx = stride; lo < half; lo += 2, idx += stride) {
+        hi = n - lo;  /* mirror partner of lo */
+        wr = one_half - c[nc - idx];
+        wi = c[idx];
+        dr = a[lo] - a[hi];
+        di = a[lo + 1] + a[hi + 1];
+        tr = wr * dr + wi * di;
+        ti = wr * di - wi * dr;
+        a[lo] -= tr;
+        /* both odd slots become (ti - previous value) */
+        a[lo + 1] = ti - a[lo + 1];
+        a[hi] += tr;
+        a[hi + 1] = ti - a[hi + 1];
+    }
+    a[half + 1] = -a[half + 1];
+}
+
+
+static void dctsub(int n, double *a, int nc, double const *c)
+{
+    int lo, hi, idx, stride, half;
+    double dif, sum, keep;
+    /* Combines each pair (a[lo], a[n - lo]) in place using c[idx] -/+ c[nc - idx] as the two weights. */
+    half = n >> 1;
+    stride = nc / n;
+    /* idx advances through c[] by stride per pair. */
+    for (lo = 1, idx = stride; lo < half; lo++, idx += stride) {
+        hi = n - lo;
+        dif = c[idx] - c[nc - idx];
+        sum = c[idx] + c[nc - idx];
+        keep = sum * a[lo] - dif * a[hi];
+        a[lo] = dif * a[lo] + sum * a[hi];
+        a[hi] = keep;
+    }
+    /* The unpaired middle element is simply scaled by c[0]. */
+    a[half] *= c[0];
+}
+
+
+static void dstsub(int n, double *a, int nc, double const *c)
+{
+    int lo, hi, idx, stride, half;
+    double dif, sum, keep;
+    /* As dctsub(), but with the roles of a[lo] and a[n - lo] exchanged in the pair update. */
+    half = n >> 1;
+    stride = nc / n;
+    /* idx advances through c[] by stride per pair. */
+    for (lo = 1, idx = stride; lo < half; lo++, idx += stride) {
+        hi = n - lo;
+        dif = c[idx] - c[nc - idx];
+        sum = c[idx] + c[nc - idx];
+        keep = sum * a[hi] - dif * a[lo];
+        a[hi] = dif * a[hi] + sum * a[lo];
+        a[lo] = keep;
+    }
+    /* The unpaired middle element is simply scaled by c[0]. */
+    a[half] *= c[0];
+}
diff --git a/src/fft4g.h b/src/fft4g.h
new file mode 100644
index 0000000..0f906ab
--- /dev/null
+++ b/src/fft4g.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+void lsx_cdft(int, int, double *, int *, double *);  /* NOTE(review): arguments presumably (n, isgn, a, ip, w) as in the fft4g.c definitions - confirm */
+void lsx_rdft(int, int, double *, int *, double *);
+void lsx_ddct(int, int, double *, int *, double *);
+void lsx_ddst(int, int, double *, int *, double *);
+void lsx_dfct(int, double *, double *, int *, double *);  /* no sign argument; the extra double * is presumably the work array t - confirm */
+void lsx_dfst(int, double *, double *, int *, double *);
+/* The same six entry points taking float buffers: */
+void lsx_cdft_f(int, int, float *, int *, float *);
+void lsx_rdft_f(int, int, float *, int *, float *);
+void lsx_ddct_f(int, int, float *, int *, float *);
+void lsx_ddst_f(int, int, float *, int *, float *);
+void lsx_dfct_f(int, float *, float *, int *, float *);
+void lsx_dfst_f(int, float *, float *, int *, float *);
+
+#define dft_br_len(l) (2ul + (1ul << (int)(log((l) / 2 + .5) / log(2.)) / 2))
+#define dft_sc_len(l) ((unsigned long)(l) / 2)
+/* Element counts used to size the int and float/double work arrays for length l; arguments are parenthesised so expressions are safe. */
+/* Over-allocate h by 2 to use these macros; LSX_UNPACK keeps its trailing ';' for existing callers. */
+#define LSX_PACK(h, n)   (h)[1] = (h)[n]
+#define LSX_UNPACK(h, n) (h)[n] = (h)[1], (h)[(n) + 1] = (h)[1] = 0;
diff --git a/src/fft4g32.c b/src/fft4g32.c
new file mode 100644
index 0000000..7a31ba4
--- /dev/null
+++ b/src/fft4g32.c
@@ -0,0 +1,36 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <stdlib.h>
+#include "filter.h"
+#define FFT4G_FLOAT
+#include "fft4g.c"
+#include "soxr-config.h"
+
+#if WITH_CR32
+#include "rdft_t.h"
+static void * null(void) {return 0;}  /* always yields a null pointer */
+static void forward (int length, void * setup, float * H) {lsx_safe_rdft_f(length,  1, H); (void)setup;}  /* float buffer: this wraps the _f transform, as in fft4g32s.c */
+static void backward(int length, void * setup, float * H) {lsx_safe_rdft_f(length, -1, H); (void)setup;}  /* same transform with sign -1; setup is unused */
+static int multiplier(void) {return 2;}
+static void nothing(void) {}  /* deliberate no-op */
+static int flags(void) {return 0;}
+
+fn_t _soxr_rdft32_cb[] = {  /* NOTE(review): slot order is presumably fixed by the users of rdft_t.h - confirm before reordering */
+  (fn_t)null,
+  (fn_t)null,
+  (fn_t)nothing,
+  (fn_t)forward,   /* lsx_safe_rdft_f with sign +1; used for two consecutive slots */
+  (fn_t)forward,
+  (fn_t)backward,  /* lsx_safe_rdft_f with sign -1; used for two consecutive slots */
+  (fn_t)backward,
+  (fn_t)_soxr_ordered_convolve_f,
+  (fn_t)_soxr_ordered_partial_convolve_f,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+  (fn_t)malloc,    /* plain C allocator trio */
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
+};
+#endif
diff --git a/src/fft4g32s.c b/src/fft4g32s.c
new file mode 100644
index 0000000..8ce9726
--- /dev/null
+++ b/src/fft4g32s.c
@@ -0,0 +1,31 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "filter.h"
+#include "util32s.h"
+#include "rdft_t.h"
+
+static void nothing(void) { /* deliberate no-op */ }
+static void * null(void) { return (void *)0; }  /* always yields a null pointer */
+static int flags(void) { return RDFT_IS_SIMD; }
+static int multiplier(void) { return 2; }
+static void forward (int length, void * setup, float * H) { (void)setup; lsx_safe_rdft_f(length,  1, H); }  /* setup is unused */
+static void backward(int length, void * setup, float * H) { (void)setup; lsx_safe_rdft_f(length, -1, H); }  /* same transform, sign -1 */
+
+fn_t _soxr_rdft32s_cb[] = {  /* NOTE(review): slot order is presumably fixed by the users of rdft_t.h - confirm before reordering */
+  (fn_t)null,
+  (fn_t)null,
+  (fn_t)nothing,
+  (fn_t)forward,   /* lsx_safe_rdft_f with sign +1; used for two consecutive slots */
+  (fn_t)forward,
+  (fn_t)backward,  /* lsx_safe_rdft_f with sign -1; used for two consecutive slots */
+  (fn_t)backward,
+  (fn_t)ORDERED_CONVOLVE_SIMD,
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+  (fn_t)SIMD_ALIGNED_MALLOC,  /* SIMD_ALIGNED_* allocator trio where the other tables use malloc/calloc/free */
+  (fn_t)SIMD_ALIGNED_CALLOC,
+  (fn_t)SIMD_ALIGNED_FREE,
+  (fn_t)flags,                /* returns RDFT_IS_SIMD */
+};
diff --git a/src/fft4g64.c b/src/fft4g64.c
new file mode 100644
index 0000000..0018516
--- /dev/null
+++ b/src/fft4g64.c
@@ -0,0 +1,35 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <stdlib.h>
+#include "filter.h"
+#include "fft4g.c"
+#include "soxr-config.h"
+
+#if WITH_CR64
+static void nothing(void) { /* deliberate no-op */ }
+static void * null(void) { return (void *)0; }  /* always yields a null pointer */
+static int flags(void) { return 0; }
+static int multiplier(void) { return 2; }
+static void forward (int length, void * setup, double * H) { (void)setup; lsx_safe_rdft(length,  1, H); }  /* setup is unused */
+static void backward(int length, void * setup, double * H) { (void)setup; lsx_safe_rdft(length, -1, H); }  /* same transform, sign -1 */
+
+typedef void (* fn_t)(void);  /* generic function-pointer type; every table entry is cast to it */
+fn_t _soxr_rdft64_cb[] = {  /* NOTE(review): slot order presumably matches the other _soxr_rdft*_cb tables - confirm before reordering */
+  (fn_t)null,
+  (fn_t)null,
+  (fn_t)nothing,
+  (fn_t)forward,   /* lsx_safe_rdft with sign +1; used for two consecutive slots */
+  (fn_t)forward,
+  (fn_t)backward,  /* lsx_safe_rdft with sign -1; used for two consecutive slots */
+  (fn_t)backward,
+  (fn_t)_soxr_ordered_convolve,
+  (fn_t)_soxr_ordered_partial_convolve,
+  (fn_t)multiplier,
+  (fn_t)nothing,
+  (fn_t)malloc,    /* plain C allocator trio */
+  (fn_t)calloc,
+  (fn_t)free,
+  (fn_t)flags,
+};
+#endif
diff --git a/src/fft4g_cache.h b/src/fft4g_cache.h
new file mode 100644
index 0000000..d776c16
--- /dev/null
+++ b/src/fft4g_cache.h
@@ -0,0 +1,92 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+static int * LSX_FFT_BR;           /* shared int work array handed to LSX_RDFT / LSX_CDFT */
+static DFT_FLOAT * LSX_FFT_SC;     /* shared DFT_FLOAT work array handed to LSX_RDFT / LSX_CDFT */
+static int FFT_LEN = -1;           /* -1: cache not initialised; otherwise the largest length allocated for so far */
+static ccrw2_t FFT_CACHE_CCRW;     /* reader/writer control object guarding the three variables above */
+
+void LSX_INIT_FFT_CACHE(void)
+{
+  if (FFT_LEN < 0) {  /* repeat calls are no-ops once FFT_LEN has left its initial -1 */
+    assert(LSX_FFT_BR == NULL);
+    assert(LSX_FFT_SC == NULL);
+    assert(FFT_LEN == -1);
+    ccrw2_init(FFT_CACHE_CCRW);
+    FFT_LEN = 0;
+  }
+}
+
+void LSX_CLEAR_FFT_CACHE(void)
+{
+  assert(FFT_LEN >= 0);  /* only valid after LSX_INIT_FFT_CACHE */
+  ccrw2_clear(FFT_CACHE_CCRW);
+  free(LSX_FFT_BR);
+  LSX_FFT_BR = NULL;
+  free(LSX_FFT_SC);
+  LSX_FFT_SC = NULL;
+  FFT_LEN = -1;  /* back to the "not initialised" marker */
+}
+
+static bool UPDATE_FFT_CACHE(int len)  /* returns true if this call grew the cache (writer state still held), false if reader state is held */
+{
+  LSX_INIT_FFT_CACHE();
+  assert(lsx_is_power_of_2(len));
+  assert(FFT_LEN >= 0);
+  ccrw2_become_reader(FFT_CACHE_CCRW);
+  if (len > FFT_LEN) {
+    ccrw2_cease_reading(FFT_CACHE_CCRW);  /* reader state is dropped before writer state is requested */
+    ccrw2_become_writer(FFT_CACHE_CCRW);
+    if (len > FFT_LEN) {  /* re-check: FFT_LEN may have changed between the two calls above */
+      int old_n = FFT_LEN;
+      FFT_LEN = len;
+      LSX_FFT_BR = realloc(LSX_FFT_BR, dft_br_len(FFT_LEN) * sizeof(*LSX_FFT_BR));  /* NOTE(review): result is not checked for NULL */
+      LSX_FFT_SC = realloc(LSX_FFT_SC, dft_sc_len(FFT_LEN) * sizeof(*LSX_FFT_SC));  /* NOTE(review): result is not checked for NULL */
+      if (!old_n) {  /* first allocation only */
+        LSX_FFT_BR[0] = 0;  /* presumably marks the tables as not yet built so the transform generates them - confirm */
+#if SOXR_LIB
+        atexit(LSX_CLEAR_FFT_CACHE);
+#endif
+      }
+      return true;  /* writer state is intentionally kept; DONE_WITH_FFT_CACHE(true) ends it */
+    }
+    ccrw2_cease_writing(FFT_CACHE_CCRW);  /* nothing to grow after all: go back to reader state */
+    ccrw2_become_reader(FFT_CACHE_CCRW);
+  }
+  return false;
+}
+
+static void DONE_WITH_FFT_CACHE(bool is_writer)
+{
+  /* Ends whichever state (reader or writer) UPDATE_FFT_CACHE left the caller in. */
+  if (!is_writer) { ccrw2_cease_reading(FFT_CACHE_CCRW); }
+  else { ccrw2_cease_writing(FFT_CACHE_CCRW); }
+}
+
+void LSX_SAFE_RDFT(int len, int type, DFT_FLOAT * d)
+{
+  bool const grew = UPDATE_FFT_CACHE(len);  /* true: this call enlarged the shared work arrays */
+  LSX_RDFT(len, type, d, LSX_FFT_BR, LSX_FFT_SC);
+  DONE_WITH_FFT_CACHE(grew);  /* must mirror the state UPDATE_FFT_CACHE returned in */
+}
+
+void LSX_SAFE_CDFT(int len, int type, DFT_FLOAT * d)
+{
+  bool const grew = UPDATE_FFT_CACHE(len);  /* true: this call enlarged the shared work arrays */
+  LSX_CDFT(len, type, d, LSX_FFT_BR, LSX_FFT_SC);
+  DONE_WITH_FFT_CACHE(grew);  /* must mirror the state UPDATE_FFT_CACHE returned in */
+}
+
+#undef UPDATE_FFT_CACHE
+#undef LSX_SAFE_RDFT
+#undef LSX_SAFE_CDFT
+#undef LSX_RDFT
+#undef LSX_INIT_FFT_CACHE
+#undef LSX_FFT_SC
+#undef LSX_FFT_BR
+#undef LSX_CLEAR_FFT_CACHE
+#undef LSX_CDFT
+#undef FFT_LEN
+#undef FFT_CACHE_CCRW
+#undef DONE_WITH_FFT_CACHE
+#undef DFT_FLOAT
diff --git a/src/fifo.h b/src/fifo.h
new file mode 100644
index 0000000..33af9fe
--- /dev/null
+++ b/src/fifo.h
@@ -0,0 +1,125 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#ifndef soxr_fifo_included
+#define soxr_fifo_included
+
+#if !defined FIFO_SIZE_T
+#define FIFO_SIZE_T size_t
+#endif
+
+#if !defined FIFO_REALLOC
+#include <stdlib.h>
+  #define FIFO_REALLOC(a,b,c) realloc(a,b)
+  #undef FIFO_FREE
+  #define FIFO_FREE free
+  #undef FIFO_MALLOC
+  #define FIFO_MALLOC malloc
+#endif
+
+typedef struct {
+  char * data;         /* Buffer holding the queued bytes. */
+  size_t allocation;   /* Number of bytes allocated for data. */
+  size_t item_size;    /* Size of each item in data. */
+  size_t begin;        /* Offset of the first byte to read. */
+  size_t end;          /* 1 + offset of the last byte to read. */
+} fifo_t;
+
+#if !defined FIFO_MIN
+  #define FIFO_MIN 0x4000
+#endif
+
+#if !defined UNUSED
+  #define UNUSED
+#endif
+
+UNUSED static void fifo_clear(fifo_t * f)
+{ /* Empty the FIFO by rewinding both offsets; the buffer itself is kept. */
+  f->begin = 0, f->end = 0;
+}
+
+UNUSED static void * fifo_reserve(fifo_t * f, FIFO_SIZE_T n0)
+{ /* Append room for n0 items; returns where to write them, or 0 if growing the buffer fails. */
+  size_t n = (size_t)n0;
+  n *= f->item_size; /* n is now a byte count. */
+
+  if (f->begin == f->end) /* Nothing queued: restart at the buffer's beginning. */
+    fifo_clear(f);
+
+  while (1) {
+    if (f->end + n <= f->allocation) { /* Fits after the current data. */
+      void *p = f->data + f->end;
+
+      f->end += n;
+      return p;
+    }
+    if (f->begin > FIFO_MIN) { /* Reclaim the already-read space at the front, then retry. */
+      memmove(f->data, f->data + f->begin, f->end - f->begin);
+      f->end -= f->begin;
+      f->begin = 0;
+      continue;
+    }
+    f->data = FIFO_REALLOC(f->data, f->allocation + n, f->allocation); /* Grow by n bytes, then retry. */
+    f->allocation += n; /* NOTE(review): bumped even if the reallocation failed. */
+    if (!f->data) /* NOTE(review): with the default realloc, the old buffer is leaked here. */
+      return 0;
+  }
+}
+
+UNUSED static void * fifo_write(fifo_t * f, FIFO_SIZE_T n0, void const * data)
+{ /* Append n0 items (copied from data unless it is NULL); returns 0 if fifo_reserve fails. */
+  size_t n = (size_t)n0;
+  void * s = fifo_reserve(f, n0);
+  if (s && data) /* Never copy into a failed (NULL) reservation. */
+    memcpy(s, data, n * f->item_size);
+  return s;
+}
+
+UNUSED static void fifo_trim_to(fifo_t * f, FIFO_SIZE_T n0)
+{
+  /* Make the occupancy exactly n0 items by placing the write offset n0
+   * items beyond the read offset; the read offset is not changed. */
+  f->end = f->begin + (size_t)n0 * f->item_size;
+}
+
+UNUSED static void fifo_trim_by(fifo_t * f, FIFO_SIZE_T n0)
+{
+  /* Drop the n0 most recently appended items by pulling the write offset
+   * back; no check is made that this many items are queued. */
+  f->end -= (size_t)n0 * f->item_size;
+}
+
+UNUSED static FIFO_SIZE_T fifo_occupancy(fifo_t * f)
+{ /* Number of whole items queued: bytes between begin and end, divided by item_size. */
+  return (FIFO_SIZE_T)((f->end - f->begin) / f->item_size);
+}
+
+UNUSED static void * fifo_read(fifo_t * f, FIFO_SIZE_T n0, void * data)
+{
+  /* Consume n0 items from the front, copying them to data unless it is NULL.
+   * Returns where the items sit in the buffer, or NULL if too few are queued. */
+  size_t const bytes = (size_t)n0 * f->item_size;
+  char * const src = f->data + f->begin;
+  if (f->end - f->begin < bytes)
+    return NULL;
+  if (data) memcpy(data, src, bytes);
+  f->begin += bytes;
+  return src;
+}
+
+#define fifo_read_ptr(f) fifo_read(f, (FIFO_SIZE_T)0, NULL)
+
+UNUSED static void fifo_delete(fifo_t * f)
+{ /* Frees the buffer only; f itself and its other fields are left as they are. */
+  FIFO_FREE(f->data);
+}
+
+UNUSED static int fifo_create(fifo_t * f, FIFO_SIZE_T item_size)
+{ /* Returns 0 on success, 1 if the initial FIFO_MIN-byte buffer can't be allocated. */
+  fifo_clear(f);
+  f->item_size = (size_t)item_size, f->allocation = FIFO_MIN;
+  f->data = FIFO_MALLOC(f->allocation);
+  return f->data == NULL;
+}
+
+#endif
diff --git a/src/filter.c b/src/filter.c
new file mode 100644
index 0000000..019d24d
--- /dev/null
+++ b/src/filter.c
@@ -0,0 +1,277 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include "filter.h"
+
+#include "math-wrap.h"
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "fft4g.h"
+#include "ccrw2.h"
+
+#if 1 || WITH_CR64 || WITH_CR64S /* Always need this, for lsx_fir_to_phase. */
+#define DFT_FLOAT double
+#define DONE_WITH_FFT_CACHE done_with_fft_cache
+#define FFT_CACHE_CCRW fft_cache_ccrw
+#define FFT_LEN fft_len
+#define LSX_CDFT lsx_cdft
+#define LSX_CLEAR_FFT_CACHE lsx_clear_fft_cache
+#define LSX_FFT_BR lsx_fft_br
+#define LSX_FFT_SC lsx_fft_sc
+#define LSX_INIT_FFT_CACHE lsx_init_fft_cache
+#define LSX_RDFT lsx_rdft
+#define LSX_SAFE_CDFT lsx_safe_cdft
+#define LSX_SAFE_RDFT lsx_safe_rdft
+#define UPDATE_FFT_CACHE update_fft_cache
+#include "fft4g_cache.h"
+#endif
+
+#if (WITH_CR32 && !AVCODEC_FOUND) || (WITH_CR32S && !AVCODEC_FOUND && !WITH_PFFFT)
+#define DFT_FLOAT float
+#define DONE_WITH_FFT_CACHE done_with_fft_cache_f
+#define FFT_CACHE_CCRW fft_cache_ccrw_f
+#define FFT_LEN fft_len_f
+#define LSX_CDFT lsx_cdft_f
+#define LSX_CLEAR_FFT_CACHE lsx_clear_fft_cache_f
+#define LSX_FFT_BR lsx_fft_br_f
+#define LSX_FFT_SC lsx_fft_sc_f
+#define LSX_INIT_FFT_CACHE lsx_init_fft_cache_f
+#define LSX_RDFT lsx_rdft_f
+#define LSX_SAFE_CDFT lsx_safe_cdft_f
+#define LSX_SAFE_RDFT lsx_safe_rdft_f
+#define UPDATE_FFT_CACHE update_fft_cache_f
+#include "fft4g_cache.h"
+#endif
+
+#if WITH_CR64 || WITH_CR64S || !SOXR_LIB
+#define DFT_FLOAT double
+#define ORDERED_CONVOLVE lsx_ordered_convolve
+#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve
+#include "rdft.h"
+#endif
+
+#if WITH_CR32
+#define DFT_FLOAT float
+#define ORDERED_CONVOLVE lsx_ordered_convolve_f
+#define ORDERED_PARTIAL_CONVOLVE lsx_ordered_partial_convolve_f
+#include "rdft.h"
+#endif
+
+double lsx_kaiser_beta(double att, double tr_bw)
+{
+  if (att >= 60) { /* Cubic fits in att; rows are one octave of tr_bw apart, from .0005 up. */
+    static const double coefs[][4] = {
+      {-6.784957e-10,1.02856e-05,0.1087556,-0.8988365+.001},
+      {-6.897885e-10,1.027433e-05,0.10876,-0.8994658+.002},
+      {-1.000683e-09,1.030092e-05,0.1087677,-0.9007898+.003},
+      {-3.654474e-10,1.040631e-05,0.1087085,-0.8977766+.006},
+      {8.106988e-09,6.983091e-06,0.1091387,-0.9172048+.015},
+      {9.519571e-09,7.272678e-06,0.1090068,-0.9140768+.025},
+      {-5.626821e-09,1.342186e-05,0.1083999,-0.9065452+.05},
+      {-9.965946e-08,5.073548e-05,0.1040967,-0.7672778+.085},
+      {1.604808e-07,-5.856462e-05,0.1185998,-1.34824+.1},
+      {-1.511964e-07,6.363034e-05,0.1064627,-0.9876665+.18},
+    };
+    int const last = (int)array_length(coefs) - 1;
+    double const octave = log(tr_bw/.0005)/log(2.);
+    double const * lo = coefs[range_limit(  (int)octave, 0, last)];
+    double const * hi = coefs[range_limit(1+(int)octave, 0, last)];
+    double const beta_lo = ((lo[0]*att + lo[1])*att + lo[2])*att + lo[3];
+    double const beta_hi = ((hi[0]*att + hi[1])*att + hi[2])*att + hi[3];
+    return beta_lo + (beta_hi - beta_lo) * (octave - (int)octave); /* Linear blend of the two rows. */
+  }
+  if (att > 50   ) return .1102 * (att - 8.7);
+  return att > 20.96? .58417 * pow(att -20.96, .4) + .07886 * (att - 20.96) : 0;
+}
+
+double * lsx_make_lpf(
+    int num_taps, double Fc, double beta, double rho, double scale)
+{ /* Returns a malloc'd array of num_taps coefficients, or NULL if malloc fails. */
+  int k, last = num_taps - 1;
+  double * taps = malloc((size_t)num_taps * sizeof(*taps));
+  double gain = scale / lsx_bessel_I_0(beta), span = 1 / (.5 * last + rho);
+  assert(Fc >= 0 && Fc <= 1);
+  lsx_debug("make_lpf(n=%i Fc=%.7g beta=%g rho=%g scale=%g)",
+      num_taps, Fc, beta, rho, scale);
+
+  for (k = 0; taps && k <= last / 2; ++k) { /* Each pass fills tap k and its mirror. */
+    double z = k - .5 * last, x = z * M_PI, y = z * span;
+    taps[k] = x!=0? sin(Fc * x) / x : Fc;
+    taps[k] *= lsx_bessel_I_0(beta * sqrt(1 - y * y)) * gain;
+    if (last - k != k)
+      taps[last - k] = taps[k];
+  }
+  return taps;
+}
+
+void lsx_kaiser_params(double att, double Fc, double tr_bw, double * beta, int * num_taps)
+{ /* Estimates *beta if it was negative, and *num_taps if it was zero. */
+  double b = *beta = *beta < 0? lsx_kaiser_beta(att, tr_bw * .5 / Fc): *beta;
+  att = att < 60? (att - 7.95) / (2.285 * M_PI * 2) :
+    ((.0007528358-1.577737e-05*b)*b+.6248022)*b+.06186902;
+  if (!*num_taps) *num_taps = (int)ceil(att/tr_bw + 1);
+}
+
+double * lsx_design_lpf(
+    double Fp,      /* End of pass-band */
+    double Fs,      /* Start of stop-band */
+    double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
+    double att,     /* Stop-band attenuation in dB */
+    int * num_taps, /* 0: value will be estimated */
+    int k,          /* >0: number of phases; <0: num_taps = 1 (mod -k) */
+    double beta)    /* <0: value will be estimated */
+{
+  int const requested = *num_taps, phases = max(k, 1), modulo = max(-k, 1);
+  double tr_bw, Fc, rho = phases == 1? .5 : att < 120? .63 : .75;
+
+  lsx_debug_more("./sinctest %-12.7g %-12.7g %g 0 %-5g %i %i 50 %g %g -4 >1",
+      Fp, Fs, Fn, att, *num_taps, k, beta, rho);
+
+  Fp /= fabs(Fn);                        /* Normalise to Fn = 1 */
+  Fs /= fabs(Fn);
+  tr_bw = .5 * (Fs - Fp) / phases; /* Transition band-width: 6dB to stop points */
+  Fs /= phases;
+  tr_bw = min(tr_bw, .5 * Fs);
+  Fc = Fs - tr_bw;
+  assert(Fc - tr_bw >= 0);
+  lsx_kaiser_params(att, Fc, tr_bw, &beta, num_taps);
+  if (!requested && phases > 1) *num_taps = *num_taps / phases * phases + phases - 1;
+  else if (!requested) *num_taps = (*num_taps + modulo - 2) / modulo * modulo + 1;
+  return Fn < 0? 0 : lsx_make_lpf(*num_taps, Fc, beta, rho, (double)phases);
+}
+
+static double safe_log(double x)
+{
+  /* log(x) for x >= 0, with one exception: log(0) is traced and -26 is
+   * returned in its place. */
+  assert(x >= 0);
+  if (x == 0) { lsx_debug("log(0)"); return -26; }
+  return log(x);
+}
+
+void lsx_fir_to_phase(double * * h, int * len, int * post_len, double phase)
+{ /* Re-phases the FIR *h (*len taps) in place; *h/*len are replaced if the length changes; *post_len is always set. */
+  double * pi_wraps, * work, phase1 = (phase > 50 ? 100 - phase : phase) / 50; /* phase is mirrored about 50. */
+  int i, work_len, begin, end, imp_peak = 0, peak = 0;
+  double imp_sum = 0, peak_imp_sum = 0;
+  double prev_angle2 = 0, cum_2pi = 0, prev_angle1 = 0, cum_1pi = 0;
+
+  for (i = *len, work_len = 2 * 2 * 8; i > 1; work_len <<= 1, i >>= 1); /* work_len: 32, doubled once per halving of *len. */
+
+  work = calloc((size_t)work_len + 2, sizeof(*work)); /* +2: (UN)PACK */ /* NOTE(review): NULL not checked. */
+  pi_wraps = malloc((((size_t)work_len + 2) / 2) * sizeof(*pi_wraps)); /* NOTE(review): NULL not checked. */
+
+  memcpy(work, *h, (size_t)*len * sizeof(*work));
+  lsx_safe_rdft(work_len, 1, work); /* Cepstral: */
+  LSX_UNPACK(work, work_len);
+
+  for (i = 0; i <= work_len; i += 2) { /* Per (re, im) pair: count phase wraps, then replace by log-magnitude. */
+    double angle = atan2(work[i + 1], work[i]);
+    double detect = 2 * M_PI;
+    double delta = angle - prev_angle2;
+    double adjust = detect * ((delta < -detect * .7) - (delta > detect * .7));
+    prev_angle2 = angle;
+    cum_2pi += adjust;
+    angle += cum_2pi;
+    detect = M_PI;
+    delta = angle - prev_angle1;
+    adjust = detect * ((delta < -detect * .7) - (delta > detect * .7));
+    prev_angle1 = angle;
+    cum_1pi += fabs(adjust); /* fabs for when 2pi and 1pi have combined */
+    pi_wraps[i >> 1] = cum_1pi;
+
+    work[i] = safe_log(sqrt(sqr(work[i]) + sqr(work[i + 1])));
+    work[i + 1] = 0;
+  }
+  LSX_PACK(work, work_len);
+  lsx_safe_rdft(work_len, -1, work);
+  for (i = 0; i < work_len; ++i) work[i] *= 2. / work_len;
+
+  for (i = 1; i < work_len / 2; ++i) { /* Window to reject acausal components */
+    work[i] *= 2;
+    work[i + work_len / 2] = 0;
+  }
+  lsx_safe_rdft(work_len, 1, work);
+
+  for (i = 2; i < work_len; i += 2) /* Interpolate between linear & min phase */
+    work[i + 1] = phase1 * i / work_len * pi_wraps[work_len >> 1] +
+        (1 - phase1) * (work[i + 1] + pi_wraps[i >> 1]) - pi_wraps[i >> 1];
+
+  work[0] = exp(work[0]), work[1] = exp(work[1]);
+  for (i = 2; i < work_len; i += 2) { /* Back from (log-magnitude, angle) to (re, im). */
+    double x = exp(work[i]);
+    work[i    ] = x * cos(work[i + 1]);
+    work[i + 1] = x * sin(work[i + 1]);
+  }
+
+  lsx_safe_rdft(work_len, -1, work);
+  for (i = 0; i < work_len; ++i) work[i] *= 2. / work_len;
+
+  /* Find peak pos. */
+  for (i = 0; i <= (int)(pi_wraps[work_len >> 1] / M_PI + .5); ++i) {
+    imp_sum += work[i];
+    if (fabs(imp_sum) > fabs(peak_imp_sum)) {
+      peak_imp_sum = imp_sum;
+      peak = i;
+    }
+    if (work[i] > work[imp_peak]) /* For debug check only */
+      imp_peak = i;
+  }
+  while (peak && fabs(work[peak-1]) > fabs(work[peak]) && work[peak-1] * work[peak] > 0)
+    --peak;
+
+  if (phase1==0)
+    begin = 0;
+  else if (phase1 == 1)
+    begin = peak - *len / 2;
+  else { /* Only this branch uses `end' and changes *len. */
+    begin = (int)((.997 - (2 - phase1) * .22) * *len + .5);
+    end   = (int)((.997 + (0 - phase1) * .22) * *len + .5);
+    begin = peak - (begin & ~3);
+    end   = peak + 1 + ((end + 3) & ~3);
+    *len = end - begin;
+    *h = realloc(*h, (size_t)*len * sizeof(**h)); /* NOTE(review): NULL not checked; *h is overwritten. */
+  }
+  for (i = 0; i < *len; ++i) (*h)[i] = /* Copy out (reversed when phase > 50), index wrapped modulo work_len. */
+    work[(begin + (phase > 50 ? *len - 1 - i : i) + work_len) & (work_len - 1)];
+  *post_len = phase > 50 ? peak - begin : begin + *len - (peak + 1);
+
+  lsx_debug("nPI=%g peak-sum@%i=%g (val@%i=%g); len=%i post=%i (%g%%)",
+      pi_wraps[work_len >> 1] / M_PI, peak, peak_imp_sum, imp_peak,
+      work[imp_peak], *len, *post_len, 100 - 100. * *post_len / (*len - 1));
+  free(pi_wraps), free(work);
+}
+
+#define F_x(F,expr) static double F(double x) {return expr;}
+F_x(sinePhi, ((2.0517e-07*x-1.1303e-04)*x+.023154)*x+.55924 )
+F_x(sinePsi, ((9.0667e-08*x-5.6114e-05)*x+.013658)*x+1.0977 )
+F_x(sinePow, log(.5)/log(sin(x*.5)) )
+#define dB_to_linear(x) exp((x) * (M_LN10 * 0.05))
+
+double lsx_f_resp(double t, double a)
+{ /* Piecewise in t: a closed form above the knee; otherwise a sine-power
+   * curve (sinePsi above .5, else sinePhi) whose value is converted to dB. */
+  double const knee = a <= 160? .8 : .82;
+  double shape, lin;
+  if (t > knee) {
+    double a1 = a+15;
+    double p = .00035*a+.375;
+    double w = 1/(1-.597)*asin(pow((a1-10.6)/a1,1/p));
+    double c = 1+asin(pow(1-a/a1,1/p))/w;
+    return a1*(pow(sin((c-t)*w),p)-1);
+  }
+  if (t > .5) {shape = sinePsi(a); lin = pow(sin((1-t) * shape), sinePow(shape));}
+  else {shape = sinePhi(a); lin = 1 - pow(sin(t * shape), sinePow(shape));}
+  return linear_to_dB(lin);
+}
+
+double lsx_inv_f_resp(double drop, double a)
+{ /* drop is given in dB; it is converted to a linear ratio before use. */
+  double const phi = sinePhi(a);
+  double const lin = dB_to_linear(drop);
+  double const s = lin > .5 ? 1 - lin : lin;
+  double const x = asin(pow(s, 1/sinePow(phi))) / phi;
+  return lin > .5? x : 1 -x;
+}
diff --git a/src/filter.h b/src/filter.h
new file mode 100644
index 0000000..ccb3ba8
--- /dev/null
+++ b/src/filter.h
@@ -0,0 +1,44 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_filter_included
+#define soxr_filter_included
+
+#include "aliases.h"
+
+double lsx_bessel_I_0(double x);
+void lsx_init_fft_cache(void);
+void lsx_clear_fft_cache(void);
+void lsx_init_fft_cache_f(void);
+void lsx_clear_fft_cache_f(void);
+#define lsx_is_power_of_2(x) (!((x) < 2 || ((x) & ((x) - 1)))) /* True for 2, 4, 8, ...; x is evaluated more than once. */
+void lsx_safe_rdft(int len, int type, double * d);
+void lsx_safe_cdft(int len, int type, double * d);
+void lsx_safe_rdft_f(int len, int type, float * d);
+void lsx_safe_cdft_f(int len, int type, float * d);
+void lsx_ordered_convolve(int n, void * not_used, double * a, const double * b);
+void lsx_ordered_convolve_f(int n, void * not_used, float * a, const float * b);
+void lsx_ordered_partial_convolve(int n, double * a, const double * b);
+void lsx_ordered_partial_convolve_f(int n, float * a, const float * b);
+
+double lsx_kaiser_beta(double att, double tr_bw);
+double * lsx_make_lpf(int num_taps, double Fc, double beta, double rho,
+    double scale);
+void lsx_kaiser_params(double att, double Fc, double tr_bw, double * beta, int * num_taps);
+double * lsx_design_lpf(
+    double Fp,      /* End of pass-band */
+    double Fs,      /* Start of stop-band */
+    double Fn,      /* Nyquist freq; e.g. 0.5, 1, PI; < 0: dummy run */
+    double att,     /* Stop-band attenuation in dB */
+    int * num_taps, /* 0: value will be estimated */
+    int k,          /* >0: number of phases; <0: num_taps = 1 (mod -k) */
+    double beta);   /* <0: value will be estimated */
+
+void lsx_fir_to_phase(double * * h, int * len,
+    int * post_len, double phase0);
+
+double lsx_f_resp(double t, double a);
+double lsx_inv_f_resp(double drop, double a);
+#define lsx_to_3dB(a) (1 - lsx_inv_f_resp(-3., a))
+
+#endif
diff --git a/src/half-coefs.h b/src/half-coefs.h
new file mode 100644
index 0000000..a5a0882
--- /dev/null
+++ b/src/half-coefs.h
@@ -0,0 +1,75 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined __GNUC__
+  #pragma GCC system_header
+#elif defined __SUNPRO_C
+  #pragma disable_warn
+#elif defined _MSC_VER
+  #pragma warning(push, 1)
+#endif
+
+#if CORE_TYPE & CORE_SIMD_HALF
+  #define VALIGN vAlign
+#else
+  #define VALIGN
+#endif
+
+#if !(CORE_TYPE & CORE_SIMD_HALF)
+static VALIGN const sample_t half_fir_coefs_7[] = {
+ 3.1062656496657370e-01, -8.4998810699955796e-02,  3.4007044621123500e-02,
+-1.2839903789829387e-02,  3.9899380181723145e-03, -8.9355202017945374e-04,
+ 1.0918292424806546e-04,
+};
+#endif
+
+static VALIGN const sample_t half_fir_coefs_8[] = {
+ 3.1154652365332069e-01, -8.7344917685739543e-02,  3.6814458353637280e-02,
+-1.5189204581464479e-02,  5.4540855610738801e-03, -1.5643862626630416e-03,
+ 3.1816575906323303e-04, -3.4799449225005688e-05,
+};
+
+static VALIGN const sample_t half_fir_coefs_9[] = {
+ 3.1227034755311189e-01, -8.9221517147969526e-02,  3.9139704015071934e-02,
+-1.7250558515852023e-02,  6.8589440230476112e-03, -2.3045049636430419e-03,
+ 6.0963740543348963e-04, -1.1323803957431231e-04,  1.1197769991000046e-05,
+};
+
+#if CORE_TYPE & CORE_DBL
+static VALIGN const sample_t half_fir_coefs_10[] = {
+ 3.1285456012000523e-01, -9.0756740799292787e-02,  4.1096398104193160e-02,
+-1.9066319572525220e-02,  8.1840569787684902e-03, -3.0766876176359834e-03,
+ 9.6396524429277980e-04, -2.3585679989922018e-04,  4.0252189026627833e-05,
+-3.6298196342497932e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_11[] = {
+ 3.1333588822574199e-01, -9.2035898673019811e-02,  4.2765169698406408e-02,
+-2.0673580894964429e-02,  9.4225426824512421e-03, -3.8563379950013192e-03,
+ 1.3634742159642453e-03, -3.9874150714431009e-04,  9.0586723632664806e-05,
+-1.4285617244076783e-05,  1.1834642946400529e-06,
+};
+
+static VALIGN const sample_t half_fir_coefs_12[] = {
+ 3.1373928463345568e-01, -9.3118180335301962e-02,  4.4205005881659098e-02,
+-2.2103860986973051e-02,  1.0574689371162864e-02, -4.6276428065385065e-03,
+ 1.7936153397572132e-03, -5.9617527051353237e-04,  1.6314517495669067e-04,
+-3.4555126770115446e-05,  5.0617615610782593e-06, -3.8768958592971409e-07,
+};
+
+static VALIGN const sample_t half_fir_coefs_13[] = {
+ 3.1408224847888910e-01, -9.4045836332667387e-02,  4.5459878763259978e-02,
+-2.3383369012219993e-02,  1.1644273044890753e-02, -5.3806714579057013e-03,
+ 2.2429072878264022e-03, -8.2204347506606424e-04,  2.5724946477840893e-04,
+-6.6072709864248668e-05,  1.3099163296288644e-05, -1.7907147069136000e-06,
+ 1.2750825595240592e-07,
+};
+#endif
+
+#undef VALIGN
+
+#if defined __SUNPRO_C
+  #pragma enable_warn
+#elif defined _MSC_VER
+  #pragma warning(pop)
+#endif
diff --git a/src/half-fir.h b/src/half-fir.h
new file mode 100644
index 0000000..782be1b
--- /dev/null
+++ b/src/half-fir.h
@@ -0,0 +1,61 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Decimate by 2 using a FIR with odd length (LEN). */
+/* Input must be preceded and followed by LEN >> 1 samples. */
+
+#define COEFS ((sample_t const *)p->coefs)
+
+#if SIMD_SSE
+  #define BEGINNING v4_t sum, q1, q2, t
+  #define ____ \
+    q1 = _mm_shuffle_ps(t=vLdu(input+2*j),vLdu(input+2*j+4),_MM_SHUFFLE(3,1,3,1)); \
+    q2 = _mm_shuffle_ps(vLdu(input-2*j-4),vLdu(input-2*j-8),_MM_SHUFFLE(1,3,1,3)); \
+    sum = vAdd(j? sum : vMul(vSet1(.5), t), vMul(vAdd(q1, q2), vLd(COEFS+j))); \
+    j += 4;
+  #define __ \
+    q1 = _mm_shuffle_ps(vLdu(input+2*j), vLdu(input-2*j-4), _MM_SHUFFLE(1,3,3,1)); \
+    q2 = _mm_loadl_pi(q2, (__m64*)(COEFS+j)), q2 = _mm_movelh_ps(q2, q2); \
+    sum = vAdd(sum, vMul(q1, q2)); \
+    j += 2;
+  #define _ \
+    q1 = _mm_add_ss(_mm_load_ss(input+2*j+1), _mm_load_ss(input-2*j-1)); \
+    sum = _mm_add_ss(sum, _mm_mul_ss(q1, _mm_load_ss(COEFS+j))); \
+    ++j;
+  #define END vStorSum(output+i, sum)
+/* #elif SIMD_AVX; No good solution found. */
+/* #elif SIMD_NEON; No need: gcc -O3 does a good job by itself. */
+#else
+  #define BEGINNING sample_t sum = input[0] * .5f
+  #define ____ __ __
+  #define __ _ _
+  #define _ sum += (input[-(2*j +1)] + input[(2*j +1)]) * COEFS[j], ++j;
+  #define END output[i] = sum
+#endif
+
+
+
+static void FUNCTION_H(stage_t * p, fifo_t * output_fifo)
+{ /* Appends num_out samples to output_fifo, then consumes 2 * num_out input samples from p's fifo. */
+  sample_t const * __restrict input = stage_read_p(p);
+  int num_in = min(stage_occupancy(p), p->input_size);
+  int i, num_out = (num_in + 1) >> 1; /* One output per input pair, rounding up. */
+  sample_t * __restrict output = fifo_reserve(output_fifo, num_out); /* NOTE(review): a 0 return is not checked. */
+
+  for (i = 0; i < num_out; ++i, input += 2) {
+    int j = 0; /* Coefficient index; the _ / __ / ____ macros advance it. */
+    BEGINNING; CONVOLVE; END; /* CONVOLVE is not defined in this file; presumably supplied by the includer. */
+  }
+  fifo_read(&p->fifo, 2 * num_out, NULL); /* NULL destination: advance past the input without copying. */
+}
+
+
+
+#undef _
+#undef __
+#undef ____
+#undef BEGINNING
+#undef END
+#undef COEFS
+#undef CONVOLVE
+#undef FUNCTION_H
diff --git a/src/internal.h b/src/internal.h
new file mode 100644
index 0000000..08924d5
--- /dev/null
+++ b/src/internal.h
@@ -0,0 +1,84 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_internal_included
+#define soxr_internal_included
+
+#include "std-types.h"
+
+
+
+#undef min
+#undef max
+#define min(a, b) ((a) <= (b) ? (a) : (b))
+#define max(a, b) ((a) >= (b) ? (a) : (b))
+
+
+
+#define range_limit(x, lower, upper) (min(max(x, lower), upper))
+#define linear_to_dB(x) (log10(x) * 20)
+#define array_length(a) (sizeof(a)/sizeof(a[0]))
+#if !defined AL
+#define AL(a) array_length(a)
+#endif
+#define iAL(a) (int)AL(a)
+#define sqr(a) ((a) * (a))
+
+
+
+#if defined __GNUC__
+  #define UNUSED __attribute__ ((unused))
+#else
+  #define UNUSED
+#endif
+
+
+
+#if !WITH_DEV_TRACE
+  #ifdef __GNUC__
+    void lsx_dummy(char const *, ...);
+  #else
+    static __inline void lsx_dummy(char const * x, ...) {}
+  #endif
+  #define lsx_debug if(0) lsx_dummy
+  #define lsx_debug_more lsx_debug
+#else
+  extern int _soxr_trace_level;
+  void _soxr_trace(char const * fmt, ...);
+  #define lsx_debug      if (_soxr_trace_level > 0) _soxr_trace
+  #define lsx_debug_more if (_soxr_trace_level > 1) _soxr_trace
+#endif
+
+
+
+/* soxr_quality_spec_t.flags: */
+
+#define SOXR_ROLLOFF_LSR2Q     3u    /* Reserved for internal use. */
+#define SOXR_ROLLOFF_MASK      3u    /* For masking these bits. */
+#define SOXR_MAINTAIN_3DB_PT   4u    /* Reserved for internal use. */
+#define SOXR_PROMOTE_TO_LQ    64u    /* Reserved for internal use. */
+
+
+
+/* soxr_runtime_spec_t.flags: */
+
+#define SOXR_STRICT_BUFFERING  4u    /* Reserved for future use. */
+#define SOXR_NOSMALLINTOPT     8u    /* For test purposes only. */
+
+
+
+/* soxr_quality_spec recipe: */
+
+#define SOXR_PRECISIONQ         11   /* Quality specified by the precision parameter. */
+
+#define SOXR_PHASE_MASK         0x30 /* For masking these bits. */
+
+
+
+/* soxr_quality_spec flags: */
+
+#define RESET_ON_CLEAR   (1u<<31)
+
+
+
+#endif
diff --git a/src/math-wrap.h b/src/math-wrap.h
new file mode 100644
index 0000000..8a526f1
--- /dev/null
+++ b/src/math-wrap.h
@@ -0,0 +1,31 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_math_wrap_included
+#define soxr_math_wrap_included
+
+#include <math.h>
+
+#if defined __STRICT_ANSI__
+  #define sinf(x)  (float)sin ((double)(x))
+  #define cosf(x)  (float)cos ((double)(x))
+  #define atanf(x) (float)atan((double)(x))
+#endif
+
+#if !defined M_PI
+  #define M_PI    3.141592653589793238462643383279502884
+#endif
+
+#if !defined M_LN10
+  #define M_LN10  2.302585092994045684017991454684364208
+#endif
+
+#if !defined M_SQRT2
+  #define M_SQRT2 1.414213562373095048801688724209698079
+#endif
+
+#if !defined M_LN2
+  #define M_LN2   0.693147180559945309417232121458176568
+#endif
+
+#endif
diff --git a/src/pffft-avx.h b/src/pffft-avx.h
new file mode 100644
index 0000000..ace19b5
--- /dev/null
+++ b/src/pffft-avx.h
@@ -0,0 +1,40 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* AVX support macros */
+
+#if !defined soxr_avx_included
+#define soxr_avx_included
+
+#include <immintrin.h>
+
+typedef __m256d v4sf;
+#define VZERO() _mm256_setzero_pd()
+#define VMUL(a,b) _mm256_mul_pd(a,b)
+#define VADD(a,b) _mm256_add_pd(a,b)
+#define VMADD(a,b,c) VADD(VMUL(a,b),c) /* Note: gcc -mfma will `fuse' these */
+#define VSUB(a,b) _mm256_sub_pd(a,b)
+#define LD_PS1(p) _mm256_set1_pd(p)
+#define INTERLEAVE2(in1, in2, out1, out2) {v4sf \
+  t1 = _mm256_unpacklo_pd(in1, in2), \
+  t2 = _mm256_unpackhi_pd(in1, in2); \
+  out1 = _mm256_permute2f128_pd(t1,t2,0x20); \
+  out2 = _mm256_permute2f128_pd(t1,t2,0x31); }
+#define UNINTERLEAVE2(in1, in2, out1, out2) {v4sf \
+  t1 = _mm256_permute2f128_pd(in1,in2,0x20), \
+  t2 = _mm256_permute2f128_pd(in1,in2,0x31); \
+  out1 = _mm256_unpacklo_pd(t1, t2); \
+  out2 = _mm256_unpackhi_pd(t1, t2);}
+#define VTRANSPOSE4(x0,x1,x2,x3) {v4sf \
+  t0 = _mm256_shuffle_pd(x0,x1, 0x0), \
+  t2 = _mm256_shuffle_pd(x0,x1, 0xf), \
+  t1 = _mm256_shuffle_pd(x2,x3, 0x0), \
+  t3 = _mm256_shuffle_pd(x2,x3, 0xf); \
+  x0 = _mm256_permute2f128_pd(t0,t1, 0x20); \
+  x1 = _mm256_permute2f128_pd(t2,t3, 0x20); \
+  x2 = _mm256_permute2f128_pd(t0,t1, 0x31); \
+  x3 = _mm256_permute2f128_pd(t2,t3, 0x31);}
+#define VSWAPHL(a,b) _mm256_permute2f128_pd(b, a, 0x30)
+#define VALIGNED(ptr) ((((long)(ptr)) & 0x1F) == 0)
+
+#endif
diff --git a/src/pffft-wrap.c b/src/pffft-wrap.c
new file mode 100644
index 0000000..c920f06
--- /dev/null
+++ b/src/pffft-wrap.c
@@ -0,0 +1,110 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined PFFT_MACROS_ONLY
+
+#include "math-wrap.h"
+
+#if PFFFT_DOUBLE
+  #include "util64s.h"
+#else
+  #include "util32s.h"
+  #define sin(x) sinf(x)
+  #define cos(x) cosf(x)
+#endif
+
+#define pffft_aligned_free    SIMD_ALIGNED_FREE
+#define pffft_aligned_malloc  SIMD_ALIGNED_MALLOC
+#define pffft_aligned_calloc  SIMD_ALIGNED_CALLOC
+
+#undef inline
+#define inline __inline
+
+#endif
+
+
+
+#include "pffft.c"
+
+
+
+#if !defined PFFT_MACROS_ONLY
+
+#if !defined PFFFT_SIMD_DISABLE
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) { /* SIMD build: ab = a * b via VCPLXMUL. */
+  int i, Ncvec = s->Ncvec;
+  const v4sf * /*RESTRICT*/ va = (const v4sf*)a;
+  const v4sf * RESTRICT vb = (const v4sf*)b;
+  v4sf * /*RESTRICT*/ vab = (v4sf*)ab;
+
+  float ar, ai, br, bi; /* Scalars saved before the loop; the loop shadows these names with vectors. */
+
+#ifdef __arm__
+  __builtin_prefetch(va);
+  __builtin_prefetch(vb);
+  __builtin_prefetch(va+2);
+  __builtin_prefetch(vb+2);
+  __builtin_prefetch(va+4);
+  __builtin_prefetch(vb+4);
+  __builtin_prefetch(va+6);
+  __builtin_prefetch(vb+6);
+#endif
+
+  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+  ar = ((v4sf_union*)va)[0].f[0]; /* Lane 0 of the first two vectors of a and of b, read */
+  ai = ((v4sf_union*)va)[1].f[0]; /* before the loop can overwrite them through vab. */
+  br = ((v4sf_union*)vb)[0].f[0];
+  bi = ((v4sf_union*)vb)[1].f[0];
+
+  for (i=0; i < Ncvec; i += 2) { /* Two vector products per pass. */
+    v4sf ar, ai, br, bi; /* Deliberately shadow the scalars declared above. */
+    ar = va[2*i+0]; ai = va[2*i+1];
+    br = vb[2*i+0]; bi = vb[2*i+1];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+0] = ar;
+    vab[2*i+1] = ai;
+    ar = va[2*i+2]; ai = va[2*i+3];
+    br = vb[2*i+2]; bi = vb[2*i+3];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+2] = ar;
+    vab[2*i+3] = ai;
+  }
+  if (s->transform == PFFFT_REAL) { /* Real transforms: lane 0 of the first two vectors is a plain product. */
+    ((v4sf_union*)vab)[0].f[0] = ar*br;
+    ((v4sf_union*)vab)[1].f[0] = ai*bi;
+  }
+}
+
+#else
+
+static void pffft_zconvolve(PFFFT_Setup *s, const float *a, const float *b, float *ab) {
+  /* Scalar build: ab = a * b, element by element, on interleaved (re, im)
+   * pairs.  VCPLXMUL comes from pffft.c; it is presumably an in-place complex
+   * multiply that leaves the product in its first two arguments. */
+  int i, Ncvec = s->Ncvec;
+  if (s->transform == PFFFT_REAL) {
+    /* take care of the fftpack ordering */
+    ab[0] = a[0]*b[0];
+    ab[2*Ncvec-1] = a[2*Ncvec-1]*b[2*Ncvec-1];
+    ++ab; ++a; ++b; --Ncvec;
+  }
+  for (i = 0; i < Ncvec; ++i, a += 2, b += 2, ab += 2) {
+    float re = a[0], im = a[1], b_re = b[0], b_im = b[1];
+    VCPLXMUL(re, im, b_re, b_im);
+    ab[0] = re;
+    ab[1] = im;
+  }
+}
+
+#endif
+
+#include <string.h>
+
+static void pffft_reorder_back(int length, void * setup, float * data, float * work)
+{ /* Copies length floats from data to work, then calls pffft_zreorder (presumably work in, data out). */
+  memcpy(work, data, (unsigned)length * sizeof(*work));
+  pffft_zreorder(setup, work, data, PFFFT_BACKWARD);
+}
+
+#endif
diff --git a/src/pffft.c b/src/pffft.c
new file mode 100644
index 0000000..46c841e
--- /dev/null
+++ b/src/pffft.c
@@ -0,0 +1,1946 @@
+/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.c
+ * with minor changes for libsoxr. */
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB
+   (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber
+   of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+
+
+   PFFFT : a Pretty Fast FFT.
+
+   This file is largely based on the original FFTPACK implementation, modified in
+   order to take advantage of SIMD instructions of modern CPUs.
+*/
+
+/*
+  ChangeLog:
+  - 2011/10/02, version 1: This is the very first release of this file.
+*/
+
+#include "pffft.h"
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+#include <assert.h>
+
+/* Detect the compiler flavour: _MSC_VER is tested first, then __GNUC__; any other compiler defines neither. */
+#if defined(_MSC_VER)
+#  define COMPILER_MSVC
+#elif defined(__GNUC__)
+#  define COMPILER_GCC
+#endif
+/* Per-compiler spellings of inlining control, restrict and stack arrays (none is defined for other compilers). */
+#if defined(COMPILER_GCC)
+#  define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline))
+#  define NEVER_INLINE(return_type) return_type __attribute__ ((noinline))
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; /* note: this expansion already ends with ';' */
+#elif defined(COMPILER_MSVC)
+#  define ALWAYS_INLINE(return_type) __forceinline return_type
+#  define NEVER_INLINE(return_type) __declspec(noinline) return_type
+#  define RESTRICT __restrict
+#  define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) /* note: no trailing ';' in this variant */
+#endif
+
+
+/*
+   vector support macros: the rest of the code is independent of
+   SSE/Altivec/NEON -- adding support for other platforms with 4-element
+   vectors should be limited to these macros
+*/
+
+
+/* define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code */
+/*#define PFFFT_SIMD_DISABLE */
+
+/*
+   Altivec support macros (v4sf holds 4 floats); selected when __ppc__ or __ppc64__ is defined
+*/
+#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__))
+typedef vector float v4sf;
+#  define SIMD_SZ 4
+#  define VZERO() ((vector float) vec_splat_u8(0))
+#  define VMUL(a,b) vec_madd(a,b, VZERO()) /* multiply expressed as a*b + 0 */
+#  define VADD(a,b) vec_add(a,b)
+#  define VMADD(a,b,c) vec_madd(a,b,c)
+#  define VSUB(a,b) vec_sub(a,b)
+static inline v4sf ld_ps1(const float *p) { v4sf v=vec_lde(0,p); return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); } /* static: a plain `inline` definition provides no external definition in C99 */
+#  define LD_PS1(p) ld_ps1(&(p)) /* p must be an lvalue; its value is replicated into every lane */
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) {                           \
+    vector unsigned char vperm1 =  (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \
+    vector unsigned char vperm2 =  (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \
+    v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \
+  }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {              \
+    v4sf y0 = vec_mergeh(x0, x2);               \
+    v4sf y1 = vec_mergel(x0, x2);               \
+    v4sf y2 = vec_mergeh(x1, x3);               \
+    v4sf y3 = vec_mergel(x1, x3);               \
+    x0 = vec_mergeh(y0, y2);                    \
+    x1 = vec_mergel(y0, y2);                    \
+    x2 = vec_mergeh(y1, y3);                    \
+    x3 = vec_mergel(y1, y3);                    \
+  }
+#  define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0xF) == 0) /* true when ptr is 16-byte aligned */
+
+/*
+  SSE1 support macros (x86 / x86-64); when PFFFT_DOUBLE is non-zero the definitions come from pffft-avx.h instead
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(i386) || defined(_M_IX86))
+
+#  define SIMD_SZ 4 /* 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. */
+
+#if !PFFFT_DOUBLE
+#include <xmmintrin.h>
+typedef __m128 v4sf;
+#  define VZERO() _mm_setzero_ps()
+#  define VMUL(a,b) _mm_mul_ps(a,b)
+#  define VADD(a,b) _mm_add_ps(a,b)
+#  define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) /* a*b + c as a separate multiply and add */
+#  define VSUB(a,b) _mm_sub_ps(a,b)
+#  define LD_PS1(p) _mm_set1_ps(p)
+#  define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3)
+#  define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0))
+#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0xF) == 0) /* 16-byte alignment test; size_t because long is narrower than a pointer on 64-bit Windows */
+
+#else
+#include "pffft-avx.h"
+#endif
+
+/*
+  ARM NEON support macros (selected when __arm__ is defined), followed by the switch to scalar code for any other target
+*/
+#elif !defined(PFFFT_SIMD_DISABLE) && defined(__arm__)
+#  include <arm_neon.h>
+typedef float32x4_t v4sf;
+#  define SIMD_SZ 4
+#  define VZERO() vdupq_n_f32(0)
+#  define VMUL(a,b) vmulq_f32(a,b)
+#  define VADD(a,b) vaddq_f32(a,b)
+#  define VMADD(a,b,c) vmlaq_f32(c,a,b) /* the addend c is passed first to vmlaq_f32 */
+#  define VSUB(a,b) vsubq_f32(a,b)
+#  define LD_PS1(p) vld1q_dup_f32(&(p)) /* p must be an lvalue */
+#  define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; }
+#  define VTRANSPOSE4(x0,x1,x2,x3) {                                    \
+    float32x4x2_t t0_ = vzipq_f32(x0, x2);                              \
+    float32x4x2_t t1_ = vzipq_f32(x1, x3);                              \
+    float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]);              \
+    float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]);              \
+    x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \
+  }
+/* marginally faster version (inline assembly, kept for reference only) */
+/*#  define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } */
+#  define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a))
+#  define VALIGNED(ptr) ((((long)(ptr)) & 0x3) == 0) /* only 4-byte alignment is tested here (mask 0x3, not 0xF) */
+#else
+#  if !defined(PFFFT_SIMD_DISABLE)
+#    warning "building with simd disabled !\n";
+#    define PFFFT_SIMD_DISABLE /* fallback to scalar code */
+#  endif
+#endif
+
+#if PFFFT_DOUBLE
+#define float double /* from here on every `float` token in this file is compiled as double */
+#endif
+
+/* fallback mode for situations where no SIMD variant was selected above: v4sf is a single scalar (SIMD_SZ 1) */
+#ifdef PFFFT_SIMD_DISABLE
+typedef float v4sf;
+#  define SIMD_SZ 1
+#  define VZERO() 0.f
+#  define VMUL(a,b) ((a)*(b))
+#  define VADD(a,b) ((a)+(b))
+#  define VMADD(a,b,c) ((a)*(b)+(c))
+#  define VSUB(a,b) ((a)-(b))
+#  define LD_PS1(p) (p)
+#  define VALIGNED(ptr) ((((size_t)(ptr)) & 0x3) == 0) /* 4-byte alignment test; size_t because long can be narrower than a pointer */
+#endif
+
+/* shortcuts for complex multiplications: (ar,ai) must be lvalues and receive the result; br and bi are each expanded twice */
+#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); }
+#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } /* same, but multiplies by the conjugate (br,-bi) */
+#ifndef SVMUL
+/* multiply a scalar with a vector (f goes through LD_PS1, so it has to be acceptable to that macro) */
+#define SVMUL(f,v) VMUL(LD_PS1(f),v)
+#endif
+
+#if !defined PFFT_MACROS_ONLY
+
+#if !defined(PFFFT_SIMD_DISABLE)
+typedef union v4sf_union { /* gives per-lane access to a SIMD vector (only declared when SIMD is enabled) */
+  v4sf  v;    /* the vector as a whole */
+  float f[4]; /* the same storage viewed as 4 scalars */
+} v4sf_union;
+
+#if 0 /* self-test of the vector support macros; compiled out */
+#include <string.h>
+
+#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3))
+
+/* detect bugs with the vector support macros: each macro is applied to known lanes, printed, then asserted */
+void validate_pffft_simd(void);
+void validate_pffft_simd(void) {
+  float f[16] = { 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 };
+  v4sf_union a0, a1, a2, a3, t, u;
+  memcpy(a0.f, f, 4*sizeof(float)); /* a0..a3 hold the values 0:3, 4:7, 8:11 and 12:15 */
+  memcpy(a1.f, f+4, 4*sizeof(float));
+  memcpy(a2.f, f+8, 4*sizeof(float));
+  memcpy(a3.f, f+12, 4*sizeof(float));
+
+  t = a0; u = a1; t.v = VZERO();
+  printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 0, 0, 0, 0);
+  t.v = VADD(a1.v, a2.v);
+  printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 12, 14, 16, 18);
+  t.v = VMUL(a1.v, a2.v);
+  printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 45, 60, 77);
+  t.v = VMADD(a1.v, a2.v,a0.v);
+  printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); assertv4(t, 32, 46, 62, 80);
+  INTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 8, 5, 9); assertv4(u, 6, 10, 7, 11);
+  UNINTERLEAVE2(a1.v,a2.v,t.v,u.v);
+  printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]);
+  assertv4(t, 4, 6, 8, 10); assertv4(u, 5, 7, 9, 11);
+
+  t.v=LD_PS1(f[15]);
+  printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 15, 15, 15, 15);
+  t.v = VSWAPHL(a1.v, a2.v); /* expected: low half taken from the 2nd argument, high half from the 1st */
+  printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]);
+  assertv4(t, 8, 9, 6, 7);
+  VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); /* in-place 4x4 transpose of a0..a3 */
+  printf("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n",
+         a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], a1.f[3],
+         a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], a3.f[2], a3.f[3]);
+  assertv4(a0, 0, 4, 8, 12); assertv4(a1, 1, 5, 9, 13); assertv4(a2, 2, 6, 10, 14); assertv4(a3, 3, 7, 11, 15);
+}
+#endif
+#endif /*!PFFFT_SIMD_DISABLE */
+
+#if 0 /* aligned allocation helpers; compiled out in this copy */
+/* SSE and co like 16-bytes aligned pointers */
+#define MALLOC_V4SF_ALIGNMENT 64 /* with a 64-byte alignment, we are even aligned on L2 cache lines... */
+void *pffft_aligned_malloc(size_t nb_bytes) {
+  void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); /* over-allocate to leave room for alignment and the saved pointer */
+  if (!p0) return (void *) 0;
+  p = (void *) (((size_t) p0 + MALLOC_V4SF_ALIGNMENT) & (~((size_t) (MALLOC_V4SF_ALIGNMENT-1)))); /* next 64-byte boundary strictly above p0 */
+  *((void **) p - 1) = p0; /* remember the malloc'ed pointer just below the returned address */
+  return p;
+}
+
+void pffft_aligned_free(void *p) { /* p must come from pffft_aligned_malloc (or be null) */
+  if (p) free(*((void **) p - 1));
+}
+
+int pffft_simd_size() { return SIMD_SZ; }
+#endif
+
+/* Radix-2 pass: reads cc, writes sums to ch[...] and (twiddled) differences to ch[l1ido + ...].
+  passf2 and passb2 have been merged here, fsign = -1 for passf2, +1 for passb2
+*/
+static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1, float fsign) {
+  int k, i;
+  int l1ido = l1*ido;
+  if (ido <= 2) { /* this path applies no twiddle factor (wa1 and fsign are unused) */
+    for (k=0; k < l1ido; k += ido, ch += ido, cc+= 2*ido) {
+      ch[0]         = VADD(cc[0], cc[ido+0]);
+      ch[l1ido]     = VSUB(cc[0], cc[ido+0]);
+      ch[1]         = VADD(cc[1], cc[ido+1]);
+      ch[l1ido + 1] = VSUB(cc[1], cc[ido+1]);
+    }
+  } else {
+    for (k=0; k < l1ido; k += ido, ch += ido, cc += 2*ido) {
+      for (i=0; i<ido-1; i+=2) { /* i indexes the real part, i+1 the imaginary part */
+        v4sf tr2 = VSUB(cc[i+0], cc[i+ido+0]);
+        v4sf ti2 = VSUB(cc[i+1], cc[i+ido+1]);
+        v4sf wr = LD_PS1(wa1[i]), wi = VMUL(LD_PS1(fsign), LD_PS1(wa1[i+1])); /* fsign flips the sign of the twiddle's second component */
+        ch[i]   = VADD(cc[i+0], cc[i+ido+0]);
+        ch[i+1] = VADD(cc[i+1], cc[i+ido+1]);
+        VCPLXMUL(tr2, ti2, wr, wi); /* (tr2,ti2) *= (wr,wi) */
+        ch[i+l1ido]   = tr2;
+        ch[i+l1ido+1] = ti2;
+      }
+    }
+  }
+}
+
+/* Radix-3 pass (compiled out by the #if 0 below); requires ido > 2.
+  passf3 and passb3 have been merged here, fsign = -1 for passf3, +1 for passb3
+*/
+#if 0
+static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, float fsign) {
+  static const float taur = -0.5f;
+  float taui = 0.866025403784439f*fsign; /* sign of this constant follows the transform direction */
+  int i, k;
+  v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3;
+  int l1ido = l1*ido;
+  float wr1, wi1, wr2, wi2;
+  assert(ido > 2);
+  for (k=0; k< l1ido; k += ido, cc+= 3*ido, ch +=ido) {
+    for (i=0; i<ido-1; i+=2) {
+      tr2 = VADD(cc[i+ido], cc[i+2*ido]);
+      cr2 = VADD(cc[i], SVMUL(taur,tr2));
+      ch[i]    = VADD(cc[i], tr2);
+      ti2 = VADD(cc[i+ido+1], cc[i+2*ido+1]);
+      ci2 = VADD(cc[i    +1], SVMUL(taur,ti2));
+      ch[i+1]  = VADD(cc[i+1], ti2);
+      cr3 = SVMUL(taui, VSUB(cc[i+ido], cc[i+2*ido]));
+      ci3 = SVMUL(taui, VSUB(cc[i+ido+1], cc[i+2*ido+1]));
+      dr2 = VSUB(cr2, ci3);
+      dr3 = VADD(cr2, ci3);
+      di2 = VADD(ci2, cr3);
+      di3 = VSUB(ci2, cr3);
+      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1]; /* twiddles for outputs 2 and 3 */
+      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+      ch[i+l1ido] = dr2;
+      ch[i+l1ido + 1] = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+      ch[i+2*l1ido] = dr3;
+      ch[i+2*l1ido+1] = di3;
+    }
+  }
+} /* passf3 */
+#endif
+
+static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2, const float *wa3, float fsign) {
+  /* Radix-4 pass from cc into ch; fsign == -1 for forward transform and +1 for backward transform */
+
+  int i, k;
+  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  int l1ido = l1*ido;
+  if (ido == 2) { /* this path applies no twiddle factors (wa1..wa3 are unused) */
+    for (k=0; k < l1ido; k += ido, ch += ido, cc += 4*ido) {
+      tr1 = VSUB(cc[0], cc[2*ido + 0]);
+      tr2 = VADD(cc[0], cc[2*ido + 0]);
+      ti1 = VSUB(cc[1], cc[2*ido + 1]);
+      ti2 = VADD(cc[1], cc[2*ido + 1]);
+      ti4 = VMUL(VSUB(cc[1*ido + 0], cc[3*ido + 0]), LD_PS1(fsign));
+      tr4 = VMUL(VSUB(cc[3*ido + 1], cc[1*ido + 1]), LD_PS1(fsign));
+      tr3 = VADD(cc[ido + 0], cc[3*ido + 0]);
+      ti3 = VADD(cc[ido + 1], cc[3*ido + 1]);
+
+      ch[0*l1ido + 0] = VADD(tr2, tr3);
+      ch[0*l1ido + 1] = VADD(ti2, ti3);
+      ch[1*l1ido + 0] = VADD(tr1, tr4);
+      ch[1*l1ido + 1] = VADD(ti1, ti4);
+      ch[2*l1ido + 0] = VSUB(tr2, tr3);
+      ch[2*l1ido + 1] = VSUB(ti2, ti3);
+      ch[3*l1ido + 0] = VSUB(tr1, tr4);
+      ch[3*l1ido + 1] = VSUB(ti1, ti4);
+    }
+  } else {
+    for (k=0; k < l1ido; k += ido, ch+=ido, cc += 4*ido) {
+      for (i=0; i<ido-1; i+=2) { /* i indexes the real part, i+1 the imaginary part */
+        float wr1, wi1, wr2, wi2, wr3, wi3;
+        tr1 = VSUB(cc[i + 0], cc[i + 2*ido + 0]);
+        tr2 = VADD(cc[i + 0], cc[i + 2*ido + 0]);
+        ti1 = VSUB(cc[i + 1], cc[i + 2*ido + 1]);
+        ti2 = VADD(cc[i + 1], cc[i + 2*ido + 1]);
+        tr4 = VMUL(VSUB(cc[i + 3*ido + 1], cc[i + 1*ido + 1]), LD_PS1(fsign));
+        ti4 = VMUL(VSUB(cc[i + 1*ido + 0], cc[i + 3*ido + 0]), LD_PS1(fsign));
+        tr3 = VADD(cc[i + ido + 0], cc[i + 3*ido + 0]);
+        ti3 = VADD(cc[i + ido + 1], cc[i + 3*ido + 1]);
+
+        ch[i] = VADD(tr2, tr3); /* first output needs no twiddle */
+        cr3    = VSUB(tr2, tr3);
+        ch[i + 1] = VADD(ti2, ti3);
+        ci3 = VSUB(ti2, ti3);
+
+        cr2 = VADD(tr1, tr4);
+        cr4 = VSUB(tr1, tr4);
+        ci2 = VADD(ti1, ti4);
+        ci4 = VSUB(ti1, ti4);
+        wr1=wa1[i], wi1=fsign*wa1[i+1];
+        VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1));
+        wr2=wa2[i], wi2=fsign*wa2[i+1]; /* twiddle for the next output is fetched before this one is stored */
+        ch[i + l1ido] = cr2;
+        ch[i + l1ido + 1] = ci2;
+
+        VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2));
+        wr3=wa3[i], wi3=fsign*wa3[i+1];
+        ch[i + 2*l1ido] = cr3;
+        ch[i + 2*l1ido + 1] = ci3;
+
+        VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3));
+        ch[i + 3*l1ido] = cr4;
+        ch[i + 3*l1ido + 1] = ci4;
+      }
+    }
+  }
+} /* passf4 */
+
+#if 0 /* radix-5 pass; compiled out in this copy */
+/*
+  passf5 and passb5 have been merged here, fsign = -1 for passf5, +1 for passb5
+*/
+static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf *cc, v4sf *ch,
+                                    const float *wa1, const float *wa2,
+                                    const float *wa3, const float *wa4, float fsign) {
+  static const float tr11 = .309016994374947f;
+  const float ti11 = .951056516295154f*fsign; /* sign follows the transform direction */
+  static const float tr12 = -.809016994374947f;
+  const float ti12 = .587785252292473f*fsign; /* sign follows the transform direction */
+
+  /* Local variables */
+  int i, k;
+  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+
+  float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4;
+
+#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1]
+#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1]
+  /* both accessors take a 1-based block number and shift the element index by +1, so (i-1, n) is the real part and (i, n) the imaginary part */
+  assert(ido > 2);
+  for (k = 0; k < l1; ++k, cc += 5*ido, ch += ido) {
+    for (i = 0; i < ido-1; i += 2) {
+      ti5 = VSUB(cc_ref(i  , 2), cc_ref(i  , 5));
+      ti2 = VADD(cc_ref(i  , 2), cc_ref(i  , 5));
+      ti4 = VSUB(cc_ref(i  , 3), cc_ref(i  , 4));
+      ti3 = VADD(cc_ref(i  , 3), cc_ref(i  , 4));
+      tr5 = VSUB(cc_ref(i-1, 2), cc_ref(i-1, 5));
+      tr2 = VADD(cc_ref(i-1, 2), cc_ref(i-1, 5));
+      tr4 = VSUB(cc_ref(i-1, 3), cc_ref(i-1, 4));
+      tr3 = VADD(cc_ref(i-1, 3), cc_ref(i-1, 4));
+      ch_ref(i-1, 1) = VADD(cc_ref(i-1, 1), VADD(tr2, tr3));
+      ch_ref(i  , 1) = VADD(cc_ref(i  , 1), VADD(ti2, ti3));
+      cr2 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr11, tr2),SVMUL(tr12, tr3)));
+      ci2 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr11, ti2),SVMUL(tr12, ti3)));
+      cr3 = VADD(cc_ref(i-1, 1), VADD(SVMUL(tr12, tr2),SVMUL(tr11, tr3)));
+      ci3 = VADD(cc_ref(i  , 1), VADD(SVMUL(tr12, ti2),SVMUL(tr11, ti3)));
+      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
+      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
+      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+      dr3 = VSUB(cr3, ci4);
+      dr4 = VADD(cr3, ci4);
+      di3 = VADD(ci3, cr4);
+      di4 = VSUB(ci3, cr4);
+      dr5 = VADD(cr2, ci5);
+      dr2 = VSUB(cr2, ci5);
+      di5 = VSUB(ci2, cr5);
+      di2 = VADD(ci2, cr5);
+      wr1=wa1[i], wi1=fsign*wa1[i+1], wr2=wa2[i], wi2=fsign*wa2[i+1];
+      wr3=wa3[i], wi3=fsign*wa3[i+1], wr4=wa4[i], wi4=fsign*wa4[i+1];
+      VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1));
+      ch_ref(i - 1, 2) = dr2;
+      ch_ref(i, 2)     = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2));
+      ch_ref(i - 1, 3) = dr3;
+      ch_ref(i, 3)     = di3;
+      VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3));
+      ch_ref(i - 1, 4) = dr4;
+      ch_ref(i, 4)     = di4;
+      VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4));
+      ch_ref(i - 1, 5) = dr5;
+      ch_ref(i, 5)     = di5;
+    }
+  }
+#undef ch_ref
+#undef cc_ref
+}
+#endif
+
+static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch, const float *wa1) {
+  static const float minus_one = -1.f; /* radf2: radix-2 pass from cc into ch (by its name, the forward real-transform variant) */
+  int i, k, l1ido = l1*ido;
+  for (k=0; k < l1ido; k += ido) { /* first element of every group: plain sum and difference */
+    v4sf a = cc[k], b = cc[k + l1ido];
+    ch[2*k] = VADD(a, b);
+    ch[2*(k+ido)-1] = VSUB(a, b);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k=0; k < l1ido; k += ido) {
+      for (i=2; i<ido; i+=2) { /* interior pairs (i-1, i); the second input is multiplied by the conjugated twiddle */
+        v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = cc[i + k + l1ido];
+        v4sf br = cc[i - 1 + k], bi = cc[i + k];
+        VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+        ch[i + 2*k] = VADD(bi, ti2);
+        ch[2*(k+ido) - i] = VSUB(ti2, bi);
+        ch[i - 1 + 2*k] = VADD(br, tr2);
+        ch[2*(k+ido) - i -1] = VSUB(br, tr2);
+      }
+    }
+    if (ido % 2 == 1) return; /* odd ido: there is no trailing element left to handle */
+  }
+  for (k=0; k < l1ido; k += ido) { /* even ido: last element of every group */
+    ch[2*k + ido] = SVMUL(minus_one, cc[ido-1 + k + l1ido]);
+    ch[2*k + ido-1] = cc[k + ido-1];
+  }
+} /* radf2 */
+
+
+static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf *cc, v4sf *ch, const float *wa1) {
+  static const float minus_two=-2; /* radb2: radix-2 pass from cc into ch (by its name, the backward counterpart of radf2) */
+  int i, k, l1ido = l1*ido;
+  v4sf a,b,c,d, tr2, ti2;
+  for (k=0; k < l1ido; k += ido) { /* first element of every group: plain sum and difference */
+    a = cc[2*k]; b = cc[2*(k+ido) - 1];
+    ch[k] = VADD(a, b);
+    ch[k + l1ido] =VSUB(a, b);
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      for (i = 2; i < ido; i += 2) { /* interior pairs (i-1, i), read from both ends of the 2*ido-long input group */
+        a = cc[i-1 + 2*k]; b = cc[2*(k + ido) - i - 1];
+        c = cc[i+0 + 2*k]; d = cc[2*(k + ido) - i + 0];
+        ch[i-1 + k] = VADD(a, b);
+        tr2 = VSUB(a, b);
+        ch[i+0 + k] = VSUB(c, d);
+        ti2 = VADD(c, d);
+        VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), LD_PS1(wa1[i - 1]));
+        ch[i-1 + k + l1ido] = tr2;
+        ch[i+0 + k + l1ido] = ti2;
+      }
+    }
+    if (ido % 2 == 1) return; /* odd ido: there is no trailing element left to handle */
+  }
+  for (k = 0; k < l1ido; k += ido) { /* even ido: last element of every group */
+    a = cc[2*k + ido-1]; b = cc[2*k + ido];
+    ch[k + ido-1] = VADD(a,a);
+    ch[k + ido-1 + l1ido] = SVMUL(minus_two, b);
+  }
+} /* radb2 */
+
+#if 0
+static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                     const float *wa1, const float *wa2) {
+  static const float taur = -0.5f; /* radf3: radix-3 pass from cc into ch; this function sits inside an #if 0 region */
+  static const float taui = 0.866025403784439f;
+  int i, k, ic;
+  v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, wi2;
+  for (k=0; k<l1; k++) { /* first element of every group, no twiddles */
+    cr2 = VADD(cc[(k + l1)*ido], cc[(k + 2*l1)*ido]);
+    ch[3*k*ido] = VADD(cc[k*ido], cr2);
+    ch[(3*k+2)*ido] = SVMUL(taui, VSUB(cc[(k + l1*2)*ido], cc[(k + l1)*ido]));
+    ch[ido-1 + (3*k + 1)*ido] = VADD(cc[k*ido], SVMUL(taur, cr2));
+  }
+  if (ido == 1) return;
+  for (k=0; k<l1; k++) {
+    for (i=2; i<ido; i+=2) {
+      ic = ido - i; /* mirrored index used for the second output block */
+      wr1 = LD_PS1(wa1[i - 2]); wi1 = LD_PS1(wa1[i - 1]);
+      dr2 = cc[i - 1 + (k + l1)*ido]; di2 = cc[i + (k + l1)*ido];
+      VCPLXMULCONJ(dr2, di2, wr1, wi1);
+
+      wr2 = LD_PS1(wa2[i - 2]); wi2 = LD_PS1(wa2[i - 1]);
+      dr3 = cc[i - 1 + (k + l1*2)*ido]; di3 = cc[i + (k + l1*2)*ido];
+      VCPLXMULCONJ(dr3, di3, wr2, wi2);
+
+      cr2 = VADD(dr2, dr3);
+      ci2 = VADD(di2, di3);
+      ch[i - 1 + 3*k*ido] = VADD(cc[i - 1 + k*ido], cr2);
+      ch[i + 3*k*ido] = VADD(cc[i + k*ido], ci2);
+      tr2 = VADD(cc[i - 1 + k*ido], SVMUL(taur, cr2));
+      ti2 = VADD(cc[i + k*ido], SVMUL(taur, ci2));
+      tr3 = SVMUL(taui, VSUB(di2, di3));
+      ti3 = SVMUL(taui, VSUB(dr3, dr2));
+      ch[i - 1 + (3*k + 2)*ido] = VADD(tr2, tr3);
+      ch[ic - 1 + (3*k + 1)*ido] = VSUB(tr2, tr3);
+      ch[i + (3*k + 2)*ido] = VADD(ti2, ti3);
+      ch[ic + (3*k + 1)*ido] = VSUB(ti3, ti2);
+    }
+  }
+} /* radf3 */
+
+
+static void radb3_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+                     const float *wa1, const float *wa2)
+{ /* radb3: radix-3 pass from cc into ch (by its name, the backward counterpart of radf3); sits inside an #if 0 region */
+  static const float taur = -0.5f;
+  static const float taui = 0.866025403784439f;
+  static const float taui_2 = 0.866025403784439f*2; /* 2*taui, used only for the first element of each group */
+  int i, k, ic;
+  v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2;
+  for (k=0; k<l1; k++) { /* first element of every group, no twiddles */
+    tr2 = cc[ido-1 + (3*k + 1)*ido]; tr2 = VADD(tr2,tr2);
+    cr2 = VMADD(LD_PS1(taur), tr2, cc[3*k*ido]);
+    ch[k*ido] = VADD(cc[3*k*ido], tr2);
+    ci3 = SVMUL(taui_2, cc[(3*k + 2)*ido]);
+    ch[(k + l1)*ido] = VSUB(cr2, ci3);
+    ch[(k + 2*l1)*ido] = VADD(cr2, ci3);
+  }
+  if (ido == 1) return;
+  for (k=0; k<l1; k++) {
+    for (i=2; i<ido; i+=2) {
+      ic = ido - i; /* mirrored index into the second input block */
+      tr2 = VADD(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]);
+      cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3*k*ido]);
+      ch[i - 1 + k*ido] = VADD(cc[i - 1 + 3*k*ido], tr2);
+      ti2 = VSUB(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]);
+      ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3*k*ido]);
+      ch[i + k*ido] = VADD(cc[i + 3*k*ido], ti2);
+      cr3 = SVMUL(taui, VSUB(cc[i - 1 + (3*k + 2)*ido], cc[ic - 1 + (3*k + 1)*ido]));
+      ci3 = SVMUL(taui, VADD(cc[i + (3*k + 2)*ido], cc[ic + (3*k + 1)*ido]));
+      dr2 = VSUB(cr2, ci3);
+      dr3 = VADD(cr2, ci3);
+      di2 = VADD(ci2, cr3);
+      di3 = VSUB(ci2, cr3);
+      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+      ch[i - 1 + (k + l1)*ido] = dr2;
+      ch[i + (k + l1)*ido] = di2;
+      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+      ch[i - 1 + (k + 2*l1)*ido] = dr3;
+      ch[i + (k + 2*l1)*ido] = di3;
+    }
+  }
+} /* radb3 */
+#endif
+
+static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf * RESTRICT ch,
+                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float * RESTRICT wa3)
+{ /* radf4: radix-4 pass from cc into ch (by its name, the forward real-transform variant); wa1..wa3 hold the twiddles */
+  static const float minus_hsqt2 = (float)-0.7071067811865475;
+  int i, k, l1ido = l1*ido;
+  { /* first element of every group; cc and ch are advanced here and restored afterwards */
+    const v4sf *RESTRICT cc_ = cc, * RESTRICT cc_end = cc + l1ido;
+    v4sf * RESTRICT ch_ = ch;
+    while (cc < cc_end) {
+      /* this loop represents between 25% and 40% of total radf4_ps cost ! */
+      v4sf a0 = cc[0], a1 = cc[l1ido];
+      v4sf a2 = cc[2*l1ido], a3 = cc[3*l1ido];
+      v4sf tr1 = VADD(a1, a3);
+      v4sf tr2 = VADD(a0, a2);
+      ch[2*ido-1] = VSUB(a0, a2);
+      ch[2*ido  ] = VSUB(a3, a1);
+      ch[0      ] = VADD(tr1, tr2);
+      ch[4*ido-1] = VSUB(tr2, tr1);
+      cc += ido; ch += 4*ido;
+    }
+    cc = cc_; ch = ch_;
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      const v4sf * RESTRICT pc = (v4sf*)(cc + 1 + k); /* pc[0], pc[1] are cc[i-1+k], cc[i+k]; advanced by 2 per iteration */
+      for (i=2; i<ido; i += 2, pc += 2) {
+        int ic = ido - i; /* mirrored index for the outputs written from the end */
+        v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4;
+        v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4;
+
+        cr2 = pc[1*l1ido+0];
+        ci2 = pc[1*l1ido+1];
+        wr=LD_PS1(wa1[i - 2]);
+        wi=LD_PS1(wa1[i - 1]);
+        VCPLXMULCONJ(cr2,ci2,wr,wi);
+
+        cr3 = pc[2*l1ido+0];
+        ci3 = pc[2*l1ido+1];
+        wr = LD_PS1(wa2[i-2]);
+        wi = LD_PS1(wa2[i-1]);
+        VCPLXMULCONJ(cr3, ci3, wr, wi);
+
+        cr4 = pc[3*l1ido];
+        ci4 = pc[3*l1ido+1];
+        wr = LD_PS1(wa3[i-2]);
+        wi = LD_PS1(wa3[i-1]);
+        VCPLXMULCONJ(cr4, ci4, wr, wi);
+
+        /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */
+
+        tr1 = VADD(cr2,cr4);
+        tr4 = VSUB(cr4,cr2);
+        tr2 = VADD(pc[0],cr3);
+        tr3 = VSUB(pc[0],cr3);
+        ch[i - 1 + 4*k] = VADD(tr1,tr2);
+        ch[ic - 1 + 4*k + 3*ido] = VSUB(tr2,tr1); /* at this point tr1 and tr2 can be disposed */
+        ti1 = VADD(ci2,ci4);
+        ti4 = VSUB(ci2,ci4);
+        ch[i - 1 + 4*k + 2*ido] = VADD(ti4,tr3);
+        ch[ic - 1 + 4*k + 1*ido] = VSUB(tr3,ti4); /* dispose tr3, ti4 */
+        ti2 = VADD(pc[1],ci3);
+        ti3 = VSUB(pc[1],ci3);
+        ch[i + 4*k] = VADD(ti1, ti2);
+        ch[ic + 4*k + 3*ido] = VSUB(ti1, ti2);
+        ch[i + 4*k + 2*ido] = VADD(tr4, ti3);
+        ch[ic + 4*k + 1*ido] = VSUB(tr4, ti3);
+      }
+    }
+    if (ido % 2 == 1) return; /* odd ido: there is no trailing element left to handle */
+  }
+  for (k=0; k<l1ido; k += ido) { /* even ido: last element of every group, scaled by -sqrt(2)/2 */
+    v4sf a = cc[ido-1 + k + l1ido], b = cc[ido-1 + k + 3*l1ido];
+    v4sf c = cc[ido-1 + k], d = cc[ido-1 + k + 2*l1ido];
+    v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b));
+    v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a));
+    ch[ido-1 + 4*k] = VADD(tr1, c);
+    ch[ido-1 + 4*k + 2*ido] = VSUB(c, tr1);
+    ch[4*k + 1*ido] = VSUB(ti1, d);
+    ch[4*k + 3*ido] = VADD(ti1, d);
+  }
+} /* radf4 */
+
+
+static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                                   const float * RESTRICT wa1, const float * RESTRICT wa2, const float *RESTRICT wa3)
+{ /* radb4: radix-4 pass from cc into ch (by its name, the backward counterpart of radf4); wa1..wa3 hold the twiddles */
+  static const float minus_sqrt2 = (float)-1.414213562373095;
+  static const float two = 2.f;
+  int i, k, l1ido = l1*ido;
+  v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, tr4;
+  { /* first element of every group; cc and ch are advanced here and restored afterwards */
+    const v4sf *RESTRICT cc_ = cc, * RESTRICT ch_end = ch + l1ido;
+    v4sf *ch_ = ch;
+    while (ch < ch_end) {
+      v4sf a = cc[0], b = cc[4*ido-1];
+      v4sf c = cc[2*ido], d = cc[2*ido-1];
+      tr3 = SVMUL(two,d);
+      tr2 = VADD(a,b);
+      tr1 = VSUB(a,b);
+      tr4 = SVMUL(two,c);
+      ch[0*l1ido] = VADD(tr2, tr3);
+      ch[2*l1ido] = VSUB(tr2, tr3);
+      ch[1*l1ido] = VSUB(tr1, tr4);
+      ch[3*l1ido] = VADD(tr1, tr4);
+
+      cc += 4*ido; ch += ido;
+    }
+    cc = cc_; ch = ch_;
+  }
+  if (ido < 2) return;
+  if (ido != 2) {
+    for (k = 0; k < l1ido; k += ido) {
+      const v4sf * RESTRICT pc = (v4sf*)(cc - 1 + 4*k); /* biased by -1 so that pc[i] is cc[i-1 + 4*k] */
+      v4sf * RESTRICT ph = (v4sf*)(ch + k + 1); /* output cursor: hops through the 4 output blocks, then moves on by 2 */
+      for (i = 2; i < ido; i += 2) {
+
+        tr1 = VSUB(pc[i], pc[4*ido - i]);
+        tr2 = VADD(pc[i], pc[4*ido - i]);
+        ti4 = VSUB(pc[2*ido + i], pc[2*ido - i]);
+        tr3 = VADD(pc[2*ido + i], pc[2*ido - i]);
+        ph[0] = VADD(tr2, tr3);
+        cr3 = VSUB(tr2, tr3);
+
+        ti3 = VSUB(pc[2*ido + i + 1], pc[2*ido - i + 1]);
+        tr4 = VADD(pc[2*ido + i + 1], pc[2*ido - i + 1]);
+        cr2 = VSUB(tr1, tr4);
+        cr4 = VADD(tr1, tr4);
+
+        ti1 = VADD(pc[i + 1], pc[4*ido - i + 1]);
+        ti2 = VSUB(pc[i + 1], pc[4*ido - i + 1]);
+
+        ph[1] = VADD(ti2, ti3); ph += l1ido;
+        ci3 = VSUB(ti2, ti3);
+        ci2 = VADD(ti1, ti4);
+        ci4 = VSUB(ti1, ti4);
+        VCPLXMUL(cr2, ci2, LD_PS1(wa1[i-2]), LD_PS1(wa1[i-1]));
+        ph[0] = cr2;
+        ph[1] = ci2; ph += l1ido;
+        VCPLXMUL(cr3, ci3, LD_PS1(wa2[i-2]), LD_PS1(wa2[i-1]));
+        ph[0] = cr3;
+        ph[1] = ci3; ph += l1ido;
+        VCPLXMUL(cr4, ci4, LD_PS1(wa3[i-2]), LD_PS1(wa3[i-1]));
+        ph[0] = cr4;
+        ph[1] = ci4; ph = ph - 3*l1ido + 2; /* back to the first output block, next pair */
+      }
+    }
+    if (ido % 2 == 1) return; /* odd ido: there is no trailing element left to handle */
+  }
+  for (k=0; k < l1ido; k+=ido) { /* even ido: last element of every group */
+    int i0 = 4*k + ido;
+    v4sf c = cc[i0-1], d = cc[i0 + 2*ido-1];
+    v4sf a = cc[i0+0], b = cc[i0 + 2*ido+0];
+    tr1 = VSUB(c,d);
+    tr2 = VADD(c,d);
+    ti1 = VADD(b,a);
+    ti2 = VSUB(b,a);
+    ch[ido-1 + k + 0*l1ido] = VADD(tr2,tr2);
+    ch[ido-1 + k + 1*l1ido] = SVMUL(minus_sqrt2, VSUB(ti1, tr1));
+    ch[ido-1 + k + 2*l1ido] = VADD(ti2, ti2);
+    ch[ido-1 + k + 3*l1ido] = SVMUL(minus_sqrt2, VADD(ti1, tr1));
+  }
+} /* radb4 */
+
+#if 0
+static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, v4sf * RESTRICT ch,
+                     const float *wa1, const float *wa2, const float *wa3, const float *wa4)
+{ /* Radix-5 pass of the forward real FFT (disabled: enclosing #if 0); reads cc, writes ch. */
+  static const float tr11 = .309016994374947f;  /* cos(2*pi/5) */
+  static const float ti11 = .951056516295154f;  /* sin(2*pi/5) */
+  static const float tr12 = -.809016994374947f; /* cos(4*pi/5) */
+  static const float ti12 = .587785252292473f;  /* sin(4*pi/5) */
+  /* wa1..wa4: twiddle tables; the i-loop below loads elements [i-3] and [i-2] of each */
+  /* System generated locals */
+  int cc_offset, ch_offset;
+
+  /* Local variables */
+  int i, k, ic;
+  v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, dr5,
+    cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5;
+  int idp2;
+
+  /* 3-index accessors; after the pointer shifts below, index (1,1,1) is element 0 */
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments: each offset equals the macro index of (1,1,1) */
+  ch_offset = 1 + ido * 6;
+  ch -= ch_offset;
+  cc_offset = 1 + ido * (1 + l1);
+  cc -= cc_offset;
+
+  /* Function Body: first the entries that need no twiddle factor */
+  for (k = 1; k <= l1; ++k) {
+    cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2));
+    ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2));
+    cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3));
+    ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3));
+    ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3));
+    ch_ref(ido, 2, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
+    ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
+    ch_ref(ido, 4, k) = VADD(cc_ref(1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
+    ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
+    /*printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); */
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2; /* ic = idp2 - i is the mirrored index used for the "2" and "4" outputs */
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      dr2 = LD_PS1(wa1[i-3]); di2 = LD_PS1(wa1[i-2]);
+      dr3 = LD_PS1(wa2[i-3]); di3 = LD_PS1(wa2[i-2]);
+      dr4 = LD_PS1(wa3[i-3]); di4 = LD_PS1(wa3[i-2]);
+      dr5 = LD_PS1(wa4[i-3]); di5 = LD_PS1(wa4[i-2]);
+      VCPLXMULCONJ(dr2, di2, cc_ref(i-1, k, 2), cc_ref(i, k, 2));
+      VCPLXMULCONJ(dr3, di3, cc_ref(i-1, k, 3), cc_ref(i, k, 3));
+      VCPLXMULCONJ(dr4, di4, cc_ref(i-1, k, 4), cc_ref(i, k, 4));
+      VCPLXMULCONJ(dr5, di5, cc_ref(i-1, k, 5), cc_ref(i, k, 5));
+      cr2 = VADD(dr2, dr5);
+      ci5 = VSUB(dr5, dr2);
+      cr5 = VSUB(di2, di5);
+      ci2 = VADD(di2, di5);
+      cr3 = VADD(dr3, dr4);
+      ci4 = VSUB(dr4, dr3);
+      cr4 = VSUB(di3, di4);
+      ci3 = VADD(di3, di4);
+      ch_ref(i - 1, 1, k) = VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3));
+      ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3)); /* note: VSUB, unlike the line above */
+      tr2 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3)));
+      ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3))); /* note: VSUB */
+      tr3 = VADD(cc_ref(i - 1, k, 1), VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3)));
+      ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3))); /* note: VSUB */
+      tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4));
+      ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4));
+      tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4));
+      ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4));
+      ch_ref(i - 1, 3, k) = VSUB(tr2, tr5);
+      ch_ref(ic - 1, 2, k) = VADD(tr2, tr5);
+      ch_ref(i, 3, k) = VADD(ti2, ti5);
+      ch_ref(ic, 2, k) = VSUB(ti5, ti2);
+      ch_ref(i - 1, 5, k) = VSUB(tr3, tr4);
+      ch_ref(ic - 1, 4, k) = VADD(tr3, tr4);
+      ch_ref(i, 5, k) = VADD(ti3, ti4);
+      ch_ref(ic, 4, k) = VSUB(ti4, ti3);
+    }
+  }
+#undef cc_ref
+#undef ch_ref
+} /* radf5 */
+
+static void radb5_ps(int ido, int l1, const v4sf *RESTRICT cc, v4sf *RESTRICT ch,
+                  const float *wa1, const float *wa2, const float *wa3, const float *wa4)
+{ /* Radix-5 pass of the backward real FFT (disabled: enclosing #if 0); reads cc, writes ch. */
+  static const float tr11 = .309016994374947f;  /* cos(2*pi/5) */
+  static const float ti11 = .951056516295154f;  /* sin(2*pi/5) */
+  static const float tr12 = -.809016994374947f; /* cos(4*pi/5) */
+  static const float ti12 = .587785252292473f;  /* sin(4*pi/5) */
+  /* wa1..wa4: twiddle tables; the i-loop below reads elements [i-3] and [i-2] of each */
+  int cc_offset, ch_offset;
+
+  /* Local variables */
+  int i, k, ic;
+  v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, ti3,
+    ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5;
+  int idp2;
+  /* 3-index accessors; after the pointer shifts below, index (1,1,1) is element 0 */
+#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1]
+#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1]
+
+  /* Parameter adjustments: each offset equals the macro index of (1,1,1) */
+  ch_offset = 1 + ido * (1 + l1);
+  ch -= ch_offset;
+  cc_offset = 1 + ido * 6;
+  cc -= cc_offset;
+
+  /* Function Body: first the entries that need no twiddle factor */
+  for (k = 1; k <= l1; ++k) {
+    ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k));
+    ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k));
+    tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k));
+    tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k));
+    ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3));
+    cr2 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
+    cr3 = VADD(cc_ref(1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
+    ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+    ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+    ch_ref(1, k, 2) = VSUB(cr2, ci5);
+    ch_ref(1, k, 3) = VSUB(cr3, ci4);
+    ch_ref(1, k, 4) = VADD(cr3, ci4);
+    ch_ref(1, k, 5) = VADD(cr2, ci5);
+  }
+  if (ido == 1) {
+    return;
+  }
+  idp2 = ido + 2; /* ic = idp2 - i is the mirrored index used for the "2" and "4" inputs */
+  for (k = 1; k <= l1; ++k) {
+    for (i = 3; i <= ido; i += 2) {
+      ic = idp2 - i;
+      ti5 = VADD(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
+      ti2 = VSUB(cc_ref(i  , 3, k), cc_ref(ic  , 2, k));
+      ti4 = VADD(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
+      ti3 = VSUB(cc_ref(i  , 5, k), cc_ref(ic  , 4, k));
+      tr5 = VSUB(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
+      tr2 = VADD(cc_ref(i-1, 3, k), cc_ref(ic-1, 2, k));
+      tr4 = VSUB(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
+      tr3 = VADD(cc_ref(i-1, 5, k), cc_ref(ic-1, 4, k));
+      ch_ref(i - 1, k, 1) = VADD(cc_ref(i-1, 1, k), VADD(tr2, tr3));
+      ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3));
+      cr2 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3)));
+      ci2 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3)));
+      cr3 = VADD(cc_ref(i-1, 1, k), VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3)));
+      ci3 = VADD(cc_ref(i  , 1, k), VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3)));
+      cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4));
+      ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4));
+      cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4));
+      ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4));
+      dr3 = VSUB(cr3, ci4);
+      dr4 = VADD(cr3, ci4);
+      di3 = VADD(ci3, cr4);
+      di4 = VSUB(ci3, cr4);
+      dr5 = VADD(cr2, ci5);
+      dr2 = VSUB(cr2, ci5);
+      di5 = VSUB(ci2, cr5);
+      di2 = VADD(ci2, cr5);
+      VCPLXMUL(dr2, di2, LD_PS1(wa1[i-3]), LD_PS1(wa1[i-2]));
+      VCPLXMUL(dr3, di3, LD_PS1(wa2[i-3]), LD_PS1(wa2[i-2]));
+      VCPLXMUL(dr4, di4, LD_PS1(wa3[i-3]), LD_PS1(wa3[i-2]));
+      VCPLXMUL(dr5, di5, LD_PS1(wa4[i-3]), LD_PS1(wa4[i-2]));
+      /* dr*/di* now hold the twiddled results; store them */
+      ch_ref(i-1, k, 2) = dr2; ch_ref(i, k, 2) = di2;
+      ch_ref(i-1, k, 3) = dr3; ch_ref(i, k, 3) = di3;
+      ch_ref(i-1, k, 4) = dr4; ch_ref(i, k, 4) = di4;
+      ch_ref(i-1, k, 5) = dr5; ch_ref(i, k, 5) = di5;
+    }
+  }
+#undef cc_ref
+#undef ch_ref
+} /* radb5 */
+#endif
+
+static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
+                                      const float *wa, const int *ifac) {
+  /* Forward real FFT driver.
+     One radix pass is run per factor stored in ifac[2 .. ifac[1]+1], taking
+     the factors from the last one to the first.  The first pass reads
+     input_readonly; each pass writes the work buffer that is not its input,
+     so the data ping-pongs between work1 and work2.
+     The twiddle table wa is walked backwards from index n-1: every pass
+     steps back by (radix-1)*ido floats and uses the tables found there.
+     Returns the buffer written by the last pass (input_readonly itself when
+     ifac[1] is 0). */
+
+  v4sf *src = (v4sf*)input_readonly;
+  v4sf *dst = (src == work2 ? work1 : work2);
+  const int nfac = ifac[1];
+  int span = n;  /* "l2" of the pass about to run */
+  int tw = n-1;  /* running index into wa */
+  int f;
+
+  assert(src != dst && work1 != work2);
+
+  for (f = nfac - 1; f >= 0; --f) {
+    const int radix = ifac[f + 2];
+    const int l1 = span / radix;
+    const int ido = n / span;
+
+    tw -= (radix - 1)*ido;
+    if (radix == 4) {
+      radf4_ps(ido, l1, src, dst, &wa[tw], &wa[tw + ido], &wa[tw + 2*ido]);
+    } else if (radix == 2) {
+      radf2_ps(ido, l1, src, dst, &wa[tw]);
+#if 0
+    } else if (radix == 5) {
+      radf5_ps(ido, l1, src, dst, &wa[tw], &wa[tw + ido], &wa[tw + 2*ido],
+               &wa[tw + 3*ido]);
+    } else if (radix == 3) {
+      radf3_ps(ido, l1, src, dst, &wa[tw], &wa[tw + ido]);
+#endif
+    } else {
+      assert(0); /* no pass is compiled in for this radix */
+    }
+    span = l1;
+
+    /* what was just written becomes the input of the next pass */
+    src = dst;
+    dst = (src == work2 ? work1 : work2);
+  }
+
+  return src; /* this is in fact the output .. */
+} /* rfftf1 */
+
+static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2,
+                                      const float *wa, const int *ifac) {
+  /* Backward real FFT driver.
+     One radix pass is run per factor stored in ifac[2 .. ifac[1]+1], in
+     stored order.  The first pass reads input_readonly; each pass writes the
+     work buffer that is not its input, so the data ping-pongs between work1
+     and work2.
+     The twiddle table wa is walked forwards from index 0: every pass uses
+     the tables at the current position, which then advances by
+     (radix-1)*ido floats.
+     Returns the buffer written by the last pass (input_readonly itself when
+     ifac[1] is 0). */
+
+  v4sf *src = (v4sf*)input_readonly;
+  v4sf *dst = (src == work2 ? work1 : work2);
+  const int nfac = ifac[1];
+  int l1 = 1;
+  int tw = 0;  /* running index into wa */
+  int f;
+
+  assert(src != dst);
+
+  for (f = 0; f < nfac; ++f) {
+    const int radix = ifac[f + 2];
+    const int l2 = radix*l1;
+    const int ido = n / l2;
+
+    if (radix == 4) {
+      radb4_ps(ido, l1, src, dst, &wa[tw], &wa[tw + ido], &wa[tw + 2*ido]);
+    } else if (radix == 2) {
+      radb2_ps(ido, l1, src, dst, &wa[tw]);
+#if 0
+    } else if (radix == 5) {
+      radb5_ps(ido, l1, src, dst, &wa[tw], &wa[tw + ido], &wa[tw + 2*ido],
+               &wa[tw + 3*ido]);
+    } else if (radix == 3) {
+      radb3_ps(ido, l1, src, dst, &wa[tw], &wa[tw + ido]);
+#endif
+    } else {
+      assert(0); /* no pass is compiled in for this radix */
+    }
+    l1 = l2;
+    tw += (radix - 1)*ido;
+
+    /* what was just written becomes the input of the next pass */
+    src = dst;
+    dst = (src == work2 ? work1 : work2);
+  }
+  return src; /* this is in fact the output .. */
+}
+
+/* Factorises n over the zero-terminated candidate list ntryh, tried in order.
+   Stores n in ifac[0], the factor count in ifac[1] (also returned) and the
+   factors from ifac[2] on; ifac must have room for them, no bound is checked.
+   Prime factors missing from ntryh are simply left out of the result. */
+static int decompose(int n, int *ifac, const int *ntryh) {
+  int nl = n, nf = 0, i, j;
+  for (j=0; ntryh[j]; ++j) {
+    int ntry = ntryh[j];
+    /* "> 1" and not "!= 1": n <= 0 must yield no factor; 0 in particular is
+       divisible by every candidate and would otherwise loop forever */
+    while (nl > 1 && nl % ntry == 0) {
+      ifac[2+nf++] = ntry;
+      nl /= ntry;
+      if (ntry == 2 && nf != 1) {
+        /* a factor 2 always goes first: shift the earlier factors up */
+        for (i = nf + 1; i > 2; --i) ifac[i] = ifac[i - 1];
+        ifac[2] = 2;
+      }
+    }
+  }
+  ifac[0] = n;
+  ifac[1] = nf;
+  return nf;
+}
+
+
+
+static void rffti1_ps(int n, float *wa, int *ifac)
+{
+  /* Fills wa with the twiddle factors of a length-n real FFT and ifac with the
+     factorisation of n (see decompose).  Only radix 4 and 2 are tried: the
+     radix-3/5 passes are compiled out (#if 0) in rfftf1_ps/rfftb1_ps, so a
+     length containing 3 or 5 must come out as non-decomposable and be refused
+     by pffft_new_setup instead of reaching the assert(0) of those drivers. */
+  static const int ntryh[] = { 4,2,0 };
+  int k1, j, ii;
+  int nf = decompose(n,ifac,ntryh);
+  float argh = (float)((2*M_PI) / n);
+  int is = 0, l1 = 1;
+  for (k1 = 1; k1 <= nf - 1; k1++) {
+    int ip = ifac[k1 + 1], ld = 0;
+    int l2 = l1*ip;
+    int ido = n / l2;
+    for (j = 1; j <= ip - 1; ++j) {
+      float argld;
+      int i = is, fi=0;
+      ld += l1;
+      argld = (float)ld*argh;
+      for (ii = 3; ii <= ido; ii += 2) {
+        i += 2;
+        fi += 1;
+        wa[i - 2] = cos((float)fi*argld); /* (cos, sin) pairs for fi = 1, 2, ... */
+        wa[i - 1] = sin((float)fi*argld);
+      }
+      is += ido; /* each of the ip-1 tables of a factor occupies ido floats */
+    }
+    l1 = l2;
+  }
+} /* rffti1 */
+
+static
+void cffti1_ps(int n, float *wa, int *ifac)
+{
+  /* Fills wa with the twiddle factors of a length-n complex FFT and ifac with
+     the factorisation of n (see decompose).  Only radix 4 and 2 are tried:
+     the radix-3/5 passes are compiled out (#if 0) in cfftf1_ps, so a length
+     containing 3 or 5 must come out as non-decomposable and be refused by
+     pffft_new_setup.  (The former "ip > 5" fix-up could never execute.) */
+  static const int ntryh[] = { 4,2,0 };
+  int k1, j, ii;
+  int nf = decompose(n,ifac,ntryh);
+  float argh = (float)((2*M_PI)/n);
+  int i = 1;
+  int l1 = 1;
+  for (k1=1; k1<=nf; k1++) {
+    int ip = ifac[k1+1];
+    int ld = 0;
+    int l2 = l1*ip;
+    int ido = n / l2;
+    int idot = ido + ido + 2;
+    int ipm = ip - 1;
+    for (j=1; j<=ipm; j++) {
+      float argld;
+      int fi = 0;
+      wa[i-1] = 1; /* every table starts with the pair (1, 0) */
+      wa[i] = 0;
+      ld += l1;
+      argld = (float)ld*argh;
+      for (ii = 4; ii <= idot; ii += 2) {
+        i += 2;
+        fi += 1;
+        wa[i-1] = cos((float)fi*argld);
+        wa[i] = sin((float)fi*argld);
+      }
+    }
+    l1 = l2;
+  }
+} /* cffti1 */
+
+
+static
+v4sf *cfftf1_ps(int n, const v4sf *input_readonly, v4sf *work1, v4sf *work2, const float *wa, const int *ifac, int isign) {
+  /* Complex FFT driver; isign (-1 or +1 at the call sites in this file) is
+     handed to every pass as a float.
+     One radix pass is run per factor stored in ifac[2 .. ifac[1]+1], in
+     stored order.  The first pass reads input_readonly; each pass writes the
+     work buffer that is not its input, so the data ping-pongs between work1
+     and work2.
+     The twiddle table wa is walked forwards from index 0, advancing by
+     (radix-1)*2*ido floats per pass.
+     Returns the buffer written by the last pass (input_readonly itself when
+     ifac[1] is 0). */
+  v4sf *src = (v4sf*)input_readonly;
+  v4sf *dst = (src == work2 ? work1 : work2);
+  const float sign = (float)isign;
+  const int nfac = ifac[1];
+  int l1 = 1;
+  int tw = 0;  /* running index into wa */
+  int f;
+
+  assert(src != dst && work1 != work2);
+
+  for (f = 0; f < nfac; ++f) {
+    const int radix = ifac[f + 2];
+    const int l2 = radix*l1;
+    const int idot = 2*(n / l2);
+
+    if (radix == 4) {
+      passf4_ps(idot, l1, src, dst, &wa[tw], &wa[tw + idot], &wa[tw + 2*idot], sign);
+    } else if (radix == 2) {
+      passf2_ps(idot, l1, src, dst, &wa[tw], sign);
+#if 0
+    } else if (radix == 5) {
+      passf5_ps(idot, l1, src, dst, &wa[tw], &wa[tw + idot], &wa[tw + 2*idot],
+                &wa[tw + 3*idot], sign);
+    } else if (radix == 3) {
+      passf3_ps(idot, l1, src, dst, &wa[tw], &wa[tw + idot], sign);
+#endif
+    } else {
+      assert(0); /* no pass is compiled in for this radix */
+    }
+    l1 = l2;
+    tw += (radix - 1)*idot;
+
+    /* what was just written becomes the input of the next pass */
+    src = dst;
+    dst = (src == work2 ? work1 : work2);
+  }
+  return src; /* this is in fact the output .. */
+}
+
+
+struct PFFFT_Setup {
+  int     N;     /* transform size, as given to pffft_new_setup */
+  int     Ncvec; /* nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) */
+  int ifac[15];  /* filled by decompose(): [0] = N/SIMD_SZ, [1] = nb of factors, [2..] = the factors */
+  pffft_transform_t transform; /* PFFFT_REAL or PFFFT_COMPLEX */
+  v4sf *data; /* allocated room for twiddle coefs; the only allocation, freed by pffft_destroy_setup */
+  float *e;    /* points into 'data' , N/4*3 elements */
+  float *twiddle; /* points into 'data', N/4 elements */
+};
+
+static
+PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) {
+  /* Creates the plan for a transform of size N: allocates the setup, fills
+     the 'e' and twiddle tables and records the factorisation in ifac.
+     Returns NULL if N is not supported or an allocation fails; the result
+     is released with pffft_destroy_setup. */
+  const int is_real = (transform == PFFFT_REAL);
+  PFFFT_Setup *s;
+  int k, m;
+  /* unfortunately, the fft size must be a multiple of 16 for complex FFTs
+     and 32 for real FFTs -- a lot of stuff would need to be rewritten to
+     handle other cases (or maybe just switch to a scalar fft, I don't know..) */
+  if (transform == PFFFT_REAL) { assert((N%(2*SIMD_SZ*SIMD_SZ))==0 && N>0); }
+  if (transform == PFFFT_COMPLEX) { assert((N%(SIMD_SZ*SIMD_SZ))==0 && N>0); }
+  /* assert() is compiled out under NDEBUG, so the same condition is also
+     enforced at run time: an unsupported size yields NULL, exactly like a
+     non-decomposable one does further down. */
+  if (N <= 0 || N % ((is_real ? 2 : 1)*SIMD_SZ*SIMD_SZ) != 0) return 0;
+  s = (PFFFT_Setup*)malloc(sizeof(PFFFT_Setup));
+  if (!s) return s;
+  s->N = N;
+  s->transform = transform;
+  /* nb of complex simd vectors */
+  s->Ncvec = (is_real ? N/2 : N)/SIMD_SZ;
+  s->data = (v4sf*)pffft_aligned_malloc(2*(size_t)s->Ncvec * sizeof(v4sf));
+  if (!s->data) {free(s); return 0;}
+  s->e = (float*)s->data;
+  s->twiddle = (float*)(s->data + (2*s->Ncvec*(SIMD_SZ-1))/SIMD_SZ);
+  /* 'e' table: the same formula serves both transform types */
+  for (k=0; k < s->Ncvec; ++k) {
+    int i = k/SIMD_SZ;
+    int j = k%SIMD_SZ;
+    for (m=0; m < SIMD_SZ-1; ++m) {
+      float A = (float)(-2*M_PI*(m+1)*k / N);
+      s->e[(2*(i*3 + m) + 0) * SIMD_SZ + j] = cos(A);
+      s->e[(2*(i*3 + m) + 1) * SIMD_SZ + j] = sin(A);
+    }
+  }
+  /* twiddle table and factorisation of N/SIMD_SZ */
+  if (is_real) {
+    rffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
+  } else {
+    cffti1_ps(N/SIMD_SZ, s->twiddle, s->ifac);
+  }
+  /* check that N is decomposable with allowed prime factors */
+  for (k=0, m=1; k < s->ifac[1]; ++k) { m *= s->ifac[2+k]; }
+  if (m != N/SIMD_SZ) {
+    pffft_destroy_setup(s); s = 0;
+  }
+
+  return s;
+}
+
+
+static
+void pffft_destroy_setup(PFFFT_Setup *s) {
+  /* Releases a setup made by pffft_new_setup; a null pointer is ignored. */
+  if (s) pffft_aligned_free(s->data);
+  free(s);
+}
+
+#if !defined(PFFFT_SIMD_DISABLE)
+
+/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1]; reads N vector pairs (in[0], in[1]) spaced in_stride apart and stores 2*N vectors downwards, the first one at out[-1] */
+static void reversed_copy(int N, const v4sf *in, int in_stride, v4sf *out) {
+  v4sf g0, g1;
+  int k;
+  INTERLEAVE2(in[0], in[1], g0, g1); in += in_stride;
+  /* g0 is kept until the very last store; g1 is carried from pair to pair */
+  *--out = VSWAPHL(g0, g1); /* [g0l, g0h], [g1l g1h] -> [g1l, g0h] */
+  for (k=1; k < N; ++k) {
+    v4sf h0, h1;
+    INTERLEAVE2(in[0], in[1], h0, h1); in += in_stride;
+    *--out = VSWAPHL(g1, h0);
+    *--out = VSWAPHL(h0, h1);
+    g1 = h1;
+  }
+  *--out = VSWAPHL(g1, g0); /* closes the sequence with the first pair's g0 */
+}
+
+static void unreversed_copy(int N, const v4sf *in, v4sf *out, int out_stride) { /* reads 2*N consecutive vectors from in, stores N pairs (out[0], out[1]) spaced out_stride apart; used by pffft_zreorder for the backward direction */
+  v4sf g0, g1, h0, h1;
+  int k;
+  g0 = g1 = in[0]; ++in; /* g0 keeps the first vector for the final pair */
+  for (k=1; k < N; ++k) {
+    h0 = *in++; h1 = *in++;
+    g1 = VSWAPHL(g1, h0);
+    h0 = VSWAPHL(h0, h1);
+    UNINTERLEAVE2(h0, g1, out[0], out[1]); out += out_stride;
+    g1 = h1; /* carried into the next pair */
+  }
+  h0 = *in++; h1 = g0; /* last pair: one more input vector plus the saved first one */
+  g1 = VSWAPHL(g1, h0);
+  h0 = VSWAPHL(h0, h1);
+  UNINTERLEAVE2(h0, g1, out[0], out[1]);
+}
+
+static
+void pffft_zreorder(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+  /* Out-of-place permutation of a spectrum between two layouts:
+     PFFFT_FORWARD is the step an ordered forward transform applies last,
+     any other direction performs the opposite rearrangement.
+     in and out must be different buffers. */
+  const int N = setup->N, Ncvec = setup->Ncvec, dk = N/32;
+  const int forward = (direction == PFFFT_FORWARD);
+  const v4sf *vin = (const v4sf*)in;
+  v4sf *vout = (v4sf*)out;
+  int k;
+  assert(in != out);
+  if (setup->transform != PFFFT_REAL) {
+    for (k=0; k < Ncvec; ++k) {
+      const int kk = (k/4) + (k%4)*(Ncvec/4);
+      if (forward) {
+        INTERLEAVE2(vin[k*2], vin[k*2+1], vout[kk*2], vout[kk*2+1]);
+      } else {
+        UNINTERLEAVE2(vin[kk*2], vin[kk*2+1], vout[k*2], vout[k*2+1]);
+      }
+    }
+  } else if (forward) {
+    for (k=0; k < dk; ++k) {
+      INTERLEAVE2(vin[k*8 + 0], vin[k*8 + 1], vout[2*k + 0], vout[2*k + 1]);
+      INTERLEAVE2(vin[k*8 + 4], vin[k*8 + 5], vout[2*(2*dk + k) + 0], vout[2*(2*dk + k) + 1]);
+    }
+    reversed_copy(dk, vin+2, 8, (v4sf*)(out + N/2));
+    reversed_copy(dk, vin+6, 8, (v4sf*)(out + N));
+  } else {
+    for (k=0; k < dk; ++k) {
+      UNINTERLEAVE2(vin[2*k + 0], vin[2*k + 1], vout[k*8 + 0], vout[k*8 + 1]);
+      UNINTERLEAVE2(vin[2*(2*dk + k) + 0], vin[2*(2*dk + k) + 1], vout[k*8 + 4], vout[k*8 + 5]);
+    }
+    unreversed_copy(dk, (v4sf*)(in + N/4), (v4sf*)(out + N - 6*SIMD_SZ), -8);
+    unreversed_copy(dk, (v4sf*)(in + 3*N/4), (v4sf*)(out + N - 2*SIMD_SZ), -8);
+  }
+}
+
+static
+void pffft_cplx_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  /* Post-pass of the forward complex transform.  Every block of 8 input
+     vectors (four r/i pairs) is transposed, combined with 6 vectors of e
+     and pushed through the 4-point butterfly documented in the loop; the
+     8 results are stored to the matching block of out.  in != out. */
+  const int nblocks = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  int b;
+  assert(in != out);
+  for (b=0; b < nblocks; ++b, in += 8, out += 8, e += 6) {
+    v4sf r0 = in[0], i0 = in[1], r1 = in[2], i1 = in[3];
+    v4sf r2 = in[4], i2 = in[5], r3 = in[6], i3 = in[7];
+    v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+
+    VTRANSPOSE4(r0,r1,r2,r3);
+    VTRANSPOSE4(i0,i1,i2,i3);
+    VCPLXMUL(r1,i1,e[0],e[1]);
+    VCPLXMUL(r2,i2,e[2],e[3]);
+    VCPLXMUL(r3,i3,e[4],e[5]);
+
+    sr0 = VADD(r0,r2); dr0 = VSUB(r0, r2);
+    sr1 = VADD(r1,r3); dr1 = VSUB(r1, r3);
+    si0 = VADD(i0,i2); di0 = VSUB(i0, i2);
+    si1 = VADD(i1,i3); di1 = VSUB(i1, i3);
+
+    /*
+      transformation for each column is:
+
+      [1   1   1   1   0   0   0   0]   [r0]
+      [1   0  -1   0   0  -1   0   1]   [r1]
+      [1  -1   1  -1   0   0   0   0]   [r2]
+      [1   0  -1   0   0   1   0  -1]   [r3]
+      [0   0   0   0   1   1   1   1] * [i0]
+      [0   1   0  -1   1   0  -1   0]   [i1]
+      [0   0   0   0   1  -1   1  -1]   [i2]
+      [0  -1   0   1   1   0  -1   0]   [i3]
+    */
+
+    out[0] = VADD(sr0, sr1); out[1] = VADD(si0, si1);
+    out[2] = VADD(dr0, di1); out[3] = VSUB(di0, dr1);
+    out[4] = VSUB(sr0, sr1); out[5] = VSUB(si0, si1);
+    out[6] = VSUB(dr0, di1); out[7] = VADD(di0, dr1);
+  }
+}
+
+static
+void pffft_cplx_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  /* Pre-pass of the backward complex transform.  Every block of 8 input
+     vectors (four r/i pairs) goes through a 4-point butterfly, is combined
+     with 6 vectors of e (VCPLXMULCONJ) and is finally transposed before
+     being stored to the matching block of out.  in != out. */
+  const int nblocks = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  int b;
+  assert(in != out);
+  for (b=0; b < nblocks; ++b, in += 8, out += 8, e += 6) {
+    v4sf r0 = in[0], i0 = in[1], r1 = in[2], i1 = in[3];
+    v4sf r2 = in[4], i2 = in[5], r3 = in[6], i3 = in[7];
+    const v4sf sr0 = VADD(r0,r2), dr0 = VSUB(r0, r2);
+    const v4sf sr1 = VADD(r1,r3), dr1 = VSUB(r1, r3);
+    const v4sf si0 = VADD(i0,i2), di0 = VSUB(i0, i2);
+    const v4sf si1 = VADD(i1,i3), di1 = VSUB(i1, i3);
+
+    r0 = VADD(sr0, sr1); i0 = VADD(si0, si1);
+    r1 = VSUB(dr0, di1); i1 = VADD(di0, dr1);
+    r2 = VSUB(sr0, sr1); i2 = VSUB(si0, si1);
+    r3 = VADD(dr0, di1); i3 = VSUB(di0, dr1);
+
+    VCPLXMULCONJ(r1,i1,e[0],e[1]);
+    VCPLXMULCONJ(r2,i2,e[2],e[3]);
+    VCPLXMULCONJ(r3,i3,e[4],e[5]);
+
+    VTRANSPOSE4(r0,r1,r2,r3);
+    VTRANSPOSE4(i0,i1,i2,i3);
+
+    out[0] = r0; out[1] = i0; out[2] = r1; out[3] = i1;
+    out[4] = r2; out[5] = i2; out[6] = r3; out[7] = i3;
+  }
+}
+
+
+static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf *in0, const v4sf *in1, const v4sf *in,
+                            const v4sf *e, v4sf *out) { /* one 8-vector block of pffft_real_finalize */
+  v4sf r0, i0, r1, i1, r2, i2, r3, i3;
+  v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1;
+  r0 = *in0; i0 = *in1; /* the first r/i pair is supplied separately by the caller */
+  r1 = *in++; i1 = *in++; r2 = *in++; i2 = *in++; r3 = *in++; i3 = *in++; /* remaining pairs: in[0..5] */
+  VTRANSPOSE4(r0,r1,r2,r3);
+  VTRANSPOSE4(i0,i1,i2,i3);
+
+  /*
+    transformation for each column is:
+
+    [1   1   1   1   0   0   0   0]   [r0]
+    [1   0  -1   0   0  -1   0   1]   [r1]
+    [1   0  -1   0   0   1   0  -1]   [r2]
+    [1  -1   1  -1   0   0   0   0]   [r3]
+    [0   0   0   0   1   1   1   1] * [i0]
+    [0  -1   0   1  -1   0   1   0]   [i1]
+    [0  -1   0   1   1   0  -1   0]   [i2]
+    [0   0   0   0  -1   1  -1   1]   [i3]
+  */
+
+  /* Pairs 1..3 are combined with e[0..5] through VCPLXMUL, which rewrites */
+  /* its first two arguments; pair 0 (r0, i0) is used unchanged. */
+
+  VCPLXMUL(r1,i1,e[0],e[1]);
+  VCPLXMUL(r2,i2,e[2],e[3]);
+  VCPLXMUL(r3,i3,e[4],e[5]);
+
+  /* Butterfly: sums and differences first, then the eight outputs. */
+  /* Note the operand order of dr1 and di1: "3 minus 1", not "1 minus 3". */
+
+  sr0 = VADD(r0,r2); dr0 = VSUB(r0,r2);
+  sr1 = VADD(r1,r3); dr1 = VSUB(r3,r1);
+  si0 = VADD(i0,i2); di0 = VSUB(i0,i2);
+  si1 = VADD(i1,i3); di1 = VSUB(i3,i1);
+
+  r0 = VADD(sr0, sr1);
+  r3 = VSUB(sr0, sr1);
+  i0 = VADD(si0, si1);
+  i3 = VSUB(si1, si0);
+  r1 = VADD(dr0, di1);
+  r2 = VSUB(dr0, di1);
+  i1 = VSUB(dr1, di0);
+  i2 = VADD(dr1, di0);
+  /* stored as r0 i0 r1 i1 r2 i2 r3 i3 in out[0..7] */
+  *out++ = r0;
+  *out++ = i0;
+  *out++ = r1;
+  *out++ = i1;
+  *out++ = r2;
+  *out++ = i2;
+  *out++ = r3;
+  *out++ = i3;
+
+}
+
+static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+  /* Post-pass of the forward real transform: 2*Ncvec vectors from in -> out, coefficients from e. */
+  v4sf_union cr, ci, *uout = (v4sf_union*)out;
+  v4sf save = in[7], zero=VZERO(); /* in[8k-1] serves as the first "r" vector of block k */
+  float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3;
+  static const float s = (float)(M_SQRT2/2);
+
+  cr.v = in[0]; ci.v = in[Ncvec*2-1]; /* first and last input vectors feed the scalar formulas below */
+  assert(in != out);
+  pffft_real_finalize_4x4(&zero, &zero, in+1, e, out);
+  /* block 0 ran with a zero first pair; lane 0 of its 8 outputs is overwritten below */
+  /*
+    [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3]
+
+    [Xr(1)   ] [1   1   1   1   0   0   0   0]
+    [Xr(N/4) ] [0   0   0   0   1   s   0  -s]
+    [Xr(N/2) ] [1   0  -1   0   0   0   0   0]
+    [Xr(3N/4)] [0   0   0   0   1  -s   0   s]
+    [Xi(1)   ] [1  -1   1  -1   0   0   0   0]
+    [Xi(N/4) ] [0   0   0   0   0  -s  -1  -s]
+    [Xi(N/2) ] [0  -1   0   1   0   0   0   0]
+    [Xi(3N/4)] [0   0   0   0   0  -s   1  -s]
+  */
+
+  xr0=(cr.f[0]+cr.f[2]) + (cr.f[1]+cr.f[3]); uout[0].f[0] = xr0;
+  xi0=(cr.f[0]+cr.f[2]) - (cr.f[1]+cr.f[3]); uout[1].f[0] = xi0;
+  xr2=(cr.f[0]-cr.f[2]);                     uout[4].f[0] = xr2;
+  xi2=(cr.f[3]-cr.f[1]);                     uout[5].f[0] = xi2;
+  xr1= ci.f[0] + s*(ci.f[1]-ci.f[3]);        uout[2].f[0] = xr1;
+  xi1=-ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[3].f[0] = xi1;
+  xr3= ci.f[0] - s*(ci.f[1]-ci.f[3]);        uout[6].f[0] = xr3;
+  xi3= ci.f[2] - s*(ci.f[1]+ci.f[3]);        uout[7].f[0] = xi3;
+  /* remaining blocks: first pair = (in[8k-1], in[8k]), other three pairs = in[8k+1 .. 8k+6] */
+  for (k=1; k < dk; ++k) {
+    v4sf save_next = in[8*k+7];
+    pffft_real_finalize_4x4(&save, &in[8*k+0], in + 8*k+1,
+                           e + k*6, out + k*8);
+    save = save_next;
+  }
+
+}
+
+static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf *in,
+                                             const v4sf *e, v4sf *out, int first) { /* one 8-vector block of pffft_real_preprocess */
+  v4sf r0=in[0], i0=in[1], r1=in[2], i1=in[3], r2=in[4], i2=in[5], r3=in[6], i3=in[7];
+  /*
+    transformation for each column is:
+
+    [1   1   1   1   0   0   0   0]   [r0]
+    [1   0   0  -1   0  -1  -1   0]   [r1]
+    [1  -1  -1   1   0   0   0   0]   [r2]
+    [1   0   0  -1   0   1   1   0]   [r3]
+    [0   0   0   0   1  -1   1  -1] * [i0]
+    [0  -1   1   0   1   0   0   1]   [i1]
+    [0   0   0   0   1   1  -1  -1]   [i2]
+    [0   1  -1   0   1   0   0   1]   [i3]
+  */
+  /* sums and differences of the outer (0,3) and inner (1,2) pairs */
+  v4sf sr0 = VADD(r0,r3), dr0 = VSUB(r0,r3);
+  v4sf sr1 = VADD(r1,r2), dr1 = VSUB(r1,r2);
+  v4sf si0 = VADD(i0,i3), di0 = VSUB(i0,i3);
+  v4sf si1 = VADD(i1,i2), di1 = VSUB(i1,i2);
+
+  r0 = VADD(sr0, sr1);
+  r2 = VSUB(sr0, sr1);
+  r1 = VSUB(dr0, si1);
+  r3 = VADD(dr0, si1);
+  i0 = VSUB(di0, di1);
+  i2 = VADD(di0, di1);
+  i1 = VSUB(si0, dr1);
+  i3 = VADD(si0, dr1);
+  /* pairs 1..3 are combined with e[0..5]; pair 0 is not */
+  VCPLXMULCONJ(r1,i1,e[0],e[1]);
+  VCPLXMULCONJ(r2,i2,e[2],e[3]);
+  VCPLXMULCONJ(r3,i3,e[4],e[5]);
+
+  VTRANSPOSE4(r0,r1,r2,r3);
+  VTRANSPOSE4(i0,i1,i2,i3);
+  /* with 'first' set only 6 vectors are stored: (r0, i0) is skipped */
+  if (!first) {
+    *out++ = r0;
+    *out++ = i0;
+  }
+  *out++ = r1;
+  *out++ = i1;
+  *out++ = r2;
+  *out++ = i2;
+  *out++ = r3;
+  *out++ = i3;
+}
+
+static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf *in, v4sf *out, const v4sf *e) {
+  int k, dk = Ncvec/SIMD_SZ; /* number of 4x4 matrix blocks */
+  /* fftpack order is f0r f1r f1i f2r f2i ... f(n-1)r f(n-1)i f(n)r */
+  /* Pre-pass of the backward real transform: 2*Ncvec vectors from in -> out, coefficients from e. */
+  v4sf_union Xr, Xi, *uout = (v4sf_union*)out;
+  float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3;
+  static const float s = (float)M_SQRT2;
+  assert(in != out);
+  for (k=0; k < 4; ++k) { /* gather floats 0, 8, 16, 24 (Xr) and 4, 12, 20, 28 (Xi) of the input */
+    Xr.f[k] = ((float*)in)[8*k];
+    Xi.f[k] = ((float*)in)[8*k+4];
+  }
+
+  pffft_real_preprocess_4x4(in, e, out+1, 1); /* will write only 6 values */
+
+  /*
+    [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3]
+
+    [cr0] [1   0   2   0   1   0   0   0]
+    [cr1] [1   0   0   0  -1   0  -2   0]
+    [cr2] [1   0  -2   0   1   0   0   0]
+    [cr3] [1   0   0   0  -1   0   2   0]
+    [ci0] [0   2   0   2   0   0   0   0]
+    [ci1] [0   s   0  -s   0  -s   0  -s]
+    [ci2] [0   0   0   0   0  -2   0   2]
+    [ci3] [0  -s   0   s   0  -s   0  -s]
+  */
+  for (k=1; k < dk; ++k) {
+    pffft_real_preprocess_4x4(in+8*k, e + k*6, out-1+k*8, 0);
+  }
+  /* the first and the last output vectors are filled lane by lane from Xr/Xi */
+  cr0=(Xr.f[0]+Xi.f[0]) + 2*Xr.f[2]; uout[0].f[0] = cr0;
+  cr1=(Xr.f[0]-Xi.f[0]) - 2*Xi.f[2]; uout[0].f[1] = cr1;
+  cr2=(Xr.f[0]+Xi.f[0]) - 2*Xr.f[2]; uout[0].f[2] = cr2;
+  cr3=(Xr.f[0]-Xi.f[0]) + 2*Xi.f[2]; uout[0].f[3] = cr3;
+  ci0= 2*(Xr.f[1]+Xr.f[3]);                       uout[2*Ncvec-1].f[0] = ci0;
+  ci1= s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[1] = ci1;
+  ci2= 2*(Xi.f[3]-Xi.f[1]);                       uout[2*Ncvec-1].f[2] = ci2;
+  ci3=-s*(Xr.f[1]-Xr.f[3]) - s*(Xi.f[1]+Xi.f[3]); uout[2*Ncvec-1].f[3] = ci3;
+}
+
+
+static
+void pffft_transform_internal(PFFFT_Setup *setup, const float *finput, float *foutput, v4sf *scratch,
+                             pffft_direction_t direction, int ordered) { /* one transform, finput -> foutput; scratch is the second work buffer */
+  int k, Ncvec   = setup->Ncvec;
+  int nf_odd = (setup->ifac[1] & 1); /* parity of the number of radix passes */
+  /* NOTE(review): scratch is used as given; the NULL -> stack fallback below is compiled out */
+#if 0
+  /* temporary buffer is allocated on the stack if the scratch pointer is NULL */
+  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+#endif
+
+  const v4sf *vinput = (const v4sf*)finput;
+  v4sf *voutput      = (v4sf*)foutput;
+  v4sf *buff[2];
+  int ib = (nf_odd ^ ordered ? 1 : 0); /* index into buff; re-derived after each driver call from the pointer it returns */
+  buff[0] = voutput; buff[1] = scratch;
+  /* both user buffers must satisfy VALIGNED */
+  assert(VALIGNED(finput) && VALIGNED(foutput));
+
+  /*assert(finput != foutput); */
+  if (direction == PFFFT_FORWARD) {
+    ib = !ib;
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftf1_ps(Ncvec*2, vinput, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+      pffft_real_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
+    } else {
+      v4sf *tmp = buff[ib];
+      for (k=0; k < Ncvec; ++k) {
+        UNINTERLEAVE2(vinput[k*2], vinput[k*2+1], tmp[k*2], tmp[k*2+1]);
+      }
+      ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib],
+                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
+      pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], (v4sf*)setup->e);
+    }
+    if (ordered) { /* the finalize step wrote buff[!ib]; reorder it back into buff[ib] */
+      pffft_zreorder(setup, (float*)buff[!ib], (float*)buff[ib], PFFFT_FORWARD);
+    } else ib = !ib;
+  } else {
+    if (vinput == buff[ib]) {
+      ib = !ib; /* may happen when finput == foutput */
+    }
+    if (ordered) { /* undo the ordering first; the reordered copy becomes the input */
+      pffft_zreorder(setup, (float*)vinput, (float*)buff[ib], PFFFT_BACKWARD);
+      vinput = buff[ib]; ib = !ib;
+    }
+    if (setup->transform == PFFFT_REAL) {
+      pffft_real_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
+      ib = (rfftb1_ps(Ncvec*2, buff[ib], buff[0], buff[1],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      pffft_cplx_preprocess(Ncvec, vinput, buff[ib], (v4sf*)setup->e);
+      ib = (cfftf1_ps(Ncvec, buff[ib], buff[0], buff[1],
+                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1);
+      for (k=0; k < Ncvec; ++k) { /* in place: source and destination are the same vectors */
+        INTERLEAVE2(buff[ib][k*2], buff[ib][k*2+1], buff[ib][k*2], buff[ib][k*2+1]);
+      }
+    }
+  }
+  /* at this point buff[ib] holds the result */
+  if (buff[ib] != voutput) {
+    /* extra copy required -- this situation should only happen when finput == foutput */
+    assert(finput==foutput);
+    for (k=0; k < Ncvec; ++k) {
+      v4sf a = buff[ib][2*k], b = buff[ib][2*k+1];
+      voutput[2*k] = a; voutput[2*k+1] = b;
+    }
+    ib = !ib;
+  }
+  assert(buff[ib] == voutput);
+}
+
+#if 0
+void pffft_zconvolve_accumulate(PFFFT_Setup *s, const float *a, const float *b, float *ab, float scaling) { /* disabled (#if 0): adds scaling * (a times b, per VCPLXMUL) onto ab */
+  int Ncvec = s->Ncvec;
+  const v4sf * RESTRICT va = (const v4sf*)a;
+  const v4sf * RESTRICT vb = (const v4sf*)b;
+  v4sf * RESTRICT vab = (v4sf*)ab;
+
+#ifdef __arm__
+  __builtin_prefetch(va);
+  __builtin_prefetch(vb);
+  __builtin_prefetch(vab);
+  __builtin_prefetch(va+2);
+  __builtin_prefetch(vb+2);
+  __builtin_prefetch(vab+2);
+  __builtin_prefetch(va+4);
+  __builtin_prefetch(vb+4);
+  __builtin_prefetch(vab+4);
+  __builtin_prefetch(va+6);
+  __builtin_prefetch(vb+6);
+  __builtin_prefetch(vab+6);
+# ifndef __clang__
+#   define ZCONVOLVE_USING_INLINE_NEON_ASM
+# endif
+#endif
+  /* NOTE(review): the macro defined above ends in _NEON_ASM, the one tested below ends in _ASM, so the asm branch is never selected */
+  float ar, ai, br, bi, abr, abi;
+#ifndef ZCONVOLVE_USING_INLINE_ASM
+  v4sf vscal = LD_PS1(scaling);
+  int i;
+#endif
+
+  assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab));
+  ar = ((v4sf_union*)va)[0].f[0];
+  ai = ((v4sf_union*)va)[1].f[0];
+  br = ((v4sf_union*)vb)[0].f[0];
+  bi = ((v4sf_union*)vb)[1].f[0];
+  abr = ((v4sf_union*)vab)[0].f[0];
+  abi = ((v4sf_union*)vab)[1].f[0];
+  /* lane 0 of the first two vectors is saved above; the PFFFT_REAL case recomputes it at the end */
+#ifdef ZCONVOLVE_USING_INLINE_ASM /* inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc */
+  const float *a_ = a, *b_ = b; float *ab_ = ab;
+  int N = Ncvec;
+  asm volatile("mov         r8, %2                  \n"
+               "vdup.f32    q15, %4                 \n"
+               "1:                                  \n"
+               "pld         [%0,#64]                \n"
+               "pld         [%1,#64]                \n"
+               "pld         [%2,#64]                \n"
+               "pld         [%0,#96]                \n"
+               "pld         [%1,#96]                \n"
+               "pld         [%2,#96]                \n"
+               "vld1.f32    {q0,q1},   [%0,:128]!         \n"
+               "vld1.f32    {q4,q5},   [%1,:128]!         \n"
+               "vld1.f32    {q2,q3},   [%0,:128]!         \n"
+               "vld1.f32    {q6,q7},   [%1,:128]!         \n"
+               "vld1.f32    {q8,q9},   [r8,:128]!          \n"
+
+               "vmul.f32    q10, q0, q4             \n"
+               "vmul.f32    q11, q0, q5             \n"
+               "vmul.f32    q12, q2, q6             \n"
+               "vmul.f32    q13, q2, q7             \n"
+               "vmls.f32    q10, q1, q5             \n"
+               "vmla.f32    q11, q1, q4             \n"
+               "vld1.f32    {q0,q1}, [r8,:128]!     \n"
+               "vmls.f32    q12, q3, q7             \n"
+               "vmla.f32    q13, q3, q6             \n"
+               "vmla.f32    q8, q10, q15            \n"
+               "vmla.f32    q9, q11, q15            \n"
+               "vmla.f32    q0, q12, q15            \n"
+               "vmla.f32    q1, q13, q15            \n"
+               "vst1.f32    {q8,q9},[%2,:128]!    \n"
+               "vst1.f32    {q0,q1},[%2,:128]!    \n"
+               "subs        %3, #2                  \n"
+               "bne         1b                      \n"
+               : "+r"(a_), "+r"(b_), "+r"(ab_), "+r"(N) : "r"(scaling) : "r8", "q0","q1","q2","q3","q4","q5","q6","q7","q8","q9", "q10","q11","q12","q13","q15","memory");
+#else /* default routine, works fine for non-arm cpus with current compilers */
+  for (i=0; i < Ncvec; i += 2) { /* two r/i vector pairs per iteration */
+    v4sf ar, ai, br, bi; /* these shadow the scalar ar, ai, br, bi declared above */
+    ar = va[2*i+0]; ai = va[2*i+1];
+    br = vb[2*i+0]; bi = vb[2*i+1];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+0] = VMADD(ar, vscal, vab[2*i+0]);
+    vab[2*i+1] = VMADD(ai, vscal, vab[2*i+1]);
+    ar = va[2*i+2]; ai = va[2*i+3];
+    br = vb[2*i+2]; bi = vb[2*i+3];
+    VCPLXMUL(ar, ai, br, bi);
+    vab[2*i+2] = VMADD(ar, vscal, vab[2*i+2]);
+    vab[2*i+3] = VMADD(ai, vscal, vab[2*i+3]);
+  }
+#endif
+  if (s->transform == PFFFT_REAL) { /* lane 0 holds two independent values: plain products, not a complex one */
+    ((v4sf_union*)vab)[0].f[0] = abr + ar*br*scaling;
+    ((v4sf_union*)vab)[1].f[0] = abi + ai*bi*scaling;
+  }
+}
+#endif
+
+
+#else /* defined(PFFFT_SIMD_DISABLE) */
+
+/* standard routine using scalar floats, without SIMD stuff. */
+
+#define pffft_zreorder_nosimd pffft_zreorder
+static
+void pffft_zreorder_nosimd(PFFFT_Setup *setup, const float *in, float *out, pffft_direction_t direction) {
+  /* Scalar reorder: moves the value stored last (index N-1) to index 1 on FORWARD, and back on BACKWARD. */
+  int const n = setup->N;
+  int idx;
+  if (setup->transform == PFFFT_COMPLEX) { /* complex transforms: straight copy of 2*N floats */
+    for (idx = 0; idx < 2*n; ++idx) out[idx] = in[idx];
+  } else if (direction == PFFFT_FORWARD) {
+    float const last = in[n-1]; /* fetched before the shift loop writes to out[] */
+    for (idx = n-1; idx > 1; --idx) out[idx] = in[idx-1];
+    out[0] = in[0];
+    out[1] = last;
+  } else {
+    float const second = in[1]; /* fetched before the shift loop writes to out[] */
+    for (idx = 1; idx < n-1; ++idx) out[idx] = in[idx+1];
+    out[0] = in[0];
+    out[n-1] = second;
+  }
+}
+
+#define pffft_transform_internal_nosimd pffft_transform_internal
+static
+void pffft_transform_internal_nosimd(PFFFT_Setup *setup, const float *input, float *output, float *scratch,
+                                    pffft_direction_t direction, int ordered) {
+  int Ncvec   = setup->Ncvec;
+  int nf_odd = (setup->ifac[1] & 1); /* parity of ifac[1] (presumably the number of fft passes -- confirm) */
+  /* Scalar transform driver: the fft routines ping-pong between output and scratch. */
+#if 0
+  /* temporary buffer is allocated on the stack if the scratch pointer is NULL (disabled here, so scratch is used as passed) */
+  int stack_allocate = (scratch == 0 ? Ncvec*2 : 1);
+  VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate);
+#endif
+  float *buff[2];
+  int ib; /* index into buff[]: which buffer is handed to the fft routine as its first work area */
+  /* if (scratch == 0) scratch = scratch_on_stack; */
+  buff[0] = output; buff[1] = scratch;
+
+  if (setup->transform == PFFFT_COMPLEX) ordered = 0; /* it is always ordered. */
+  ib = (nf_odd ^ ordered ? 1 : 0); /* initial choice; presumably picked so the result ends up in output -- confirm */
+
+  if (direction == PFFFT_FORWARD) {
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftf1_ps(Ncvec*2, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1); /* ib := buffer the routine returned */
+    } else {
+      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0], -1) == buff[0] ? 0 : 1);
+    }
+    if (ordered) {
+      pffft_zreorder(setup, buff[ib], buff[!ib], PFFFT_FORWARD); ib = !ib; /* reordered copy lands in the other buffer */
+    }
+  } else {
+    if (input == buff[ib]) {
+      ib = !ib; /* may happen when input == output */
+    }
+    if (ordered) {
+      pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); /* undo the ordering before transforming */
+      input = buff[!ib];
+    }
+    if (setup->transform == PFFFT_REAL) {
+      ib = (rfftb1_ps(Ncvec*2, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0]) == buff[0] ? 0 : 1);
+    } else {
+      ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib],
+                      setup->twiddle, &setup->ifac[0], +1) == buff[0] ? 0 : 1); /* same routine as forward, last argument +1 instead of -1 */
+    }
+  }
+  if (buff[ib] != output) {
+    int k;
+    /* extra copy required -- this situation should happen only when input == output */
+    assert(input==output);
+    for (k=0; k < Ncvec; ++k) {
+      float a = buff[ib][2*k], b = buff[ib][2*k+1];
+      output[2*k] = a; output[2*k+1] = b;
+    }
+    ib = !ib;
+  }
+  assert(buff[ib] == output); /* postcondition: the result is in output */
+}
+
+#if 0 /* compiled out: scalar ab += (a * b) * scaling */
+#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate
+void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup *s, const float *a, const float *b,
+                                       float *ab, float scaling) {
+  int i, Ncvec = s->Ncvec;
+
+  if (s->transform == PFFFT_REAL) {
+    /* take care of the fftpack ordering: the first and last floats are multiplied as plain reals */
+    ab[0] += a[0]*b[0]*scaling;
+    ab[2*Ncvec-1] += a[2*Ncvec-1]*b[2*Ncvec-1]*scaling;
+    ++ab; ++a; ++b; --Ncvec; /* the remaining floats are handled as Ncvec-1 (re, im) pairs starting at index 1 */
+  }
+  for (i=0; i < Ncvec; ++i) {
+    float ar, ai, br, bi;
+    ar = a[2*i+0]; ai = a[2*i+1];
+    br = b[2*i+0]; bi = b[2*i+1];
+    VCPLXMUL(ar, ai, br, bi); /* defined elsewhere; presumably (ar,ai) *= (br,bi) in place -- confirm */
+    ab[2*i+0] += ar*scaling;
+    ab[2*i+1] += ai*scaling;
+  }
+}
+#endif
+
+#endif /* defined(PFFFT_SIMD_DISABLE) */
+
+static /* transform entry point that requests no reordering of the result */
+void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 0); /* last argument: ordered = 0 */
+}
+
+static /* transform entry point that requests the ordered layout */
+void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction) {
+  pffft_transform_internal(setup, input, output, (v4sf*)work, direction, 1); /* last argument: ordered = 1 */
+}
+
+#endif
diff --git a/src/pffft.h b/src/pffft.h
new file mode 100644
index 0000000..63522ca
--- /dev/null
+++ b/src/pffft.h
@@ -0,0 +1,197 @@
+/* https://bitbucket.org/jpommier/pffft/raw/483453d8f7661058e74aa4e7cf5c27bcd7887e7a/pffft.h
+ * with minor changes for libsoxr. */
+
+#if !defined PFFT_MACROS_ONLY
+
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instruction found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.
+
+   Restrictions:
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have a "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if PFFFT_DOUBLE
+#define float double
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads.
+  */
+  static
+  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  static
+  void pffft_destroy_setup(PFFFT_Setup *);
+  /*
+     Perform a Fourier transform. The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small FFTs,
+     say for N < 16384) -- except in the scalar (PFFFT_SIMD_DISABLE) code, where that fallback is compiled out.
+
+     input and output may alias.
+  */
+  static
+  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+
+     input and output may alias.
+  */
+  static
+  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+
+     input and output should not alias.
+  */
+  static
+  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /*
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /*
+    the float buffers must have the correct alignment (16-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers.
+  */
+#if 0
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffft_aligned_free(void *);
+
+  /* return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
+  int pffft_simd_size();
+#endif
+
+#undef float
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
+#endif
diff --git a/src/pffft32.c b/src/pffft32.c
new file mode 100644
index 0000000..f480809
--- /dev/null
+++ b/src/pffft32.c
@@ -0,0 +1,39 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define SIMD_ALIGNED_FREE free
+#define SIMD_ALIGNED_MALLOC malloc
+#define PFFFT_SIMD_DISABLE
+#define PFFFT_DOUBLE 0
+#include "pffft-wrap.c"
+
+#include "filter.h"
+#include "rdft_t.h"
+
+static void * setup(int len) {void * s = pffft_new_setup(len, PFFFT_REAL); return s;} /* real-data setup */
+static void delete_setup(void * setup) { pffft_destroy_setup(setup); }
+static void forward(int length, void * setup, float * h, float * scratch) {(void)length; pffft_transform(setup, h, h, scratch, PFFFT_FORWARD);} /* in place */
+static void oforward(int length, void * setup, float * h, float * scratch) {(void)length; pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD);} /* in place, ordered */
+static void backward(int length, void * setup, float * H, float * scratch) {(void)length; pffft_transform(setup, H, H, scratch, PFFFT_BACKWARD);} /* in place */
+static void obackward(int length, void * setup, float * H, float * scratch) {(void)length; pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);} /* in place, ordered */
+static void convolve(int length, void * setup, float * H, float const * with) {(void)length; pffft_zconvolve(setup, H, with, H);} /* result overwrites H */
+static int multiplier(void) { return 1; }
+static int flags(void) { return RDFT_NEEDS_SCRATCH; } /* scalar build: scratch buffer required, no SIMD flag */
+
+fn_t _soxr_rdft32_cb[] = { /* entry n presumably backs the RDFT_CB[n] accessor of rdft_t.h named at right -- confirm */
+  (fn_t)setup, /* 0: rdft_forward_setup */
+  (fn_t)setup, /* 1: rdft_backward_setup */
+  (fn_t)delete_setup, /* 2: rdft_delete_setup */
+  (fn_t)forward, /* 3: rdft_forward */
+  (fn_t)oforward, /* 4: rdft_oforward */
+  (fn_t)backward, /* 5: rdft_backward */
+  (fn_t)obackward, /* 6: rdft_obackward */
+  (fn_t)convolve, /* 7: rdft_convolve */
+  (fn_t)_soxr_ordered_partial_convolve_f, /* 8: rdft_convolve_portion */
+  (fn_t)multiplier, /* 9: rdft_multiplier */
+  (fn_t)pffft_reorder_back, /* 10: rdft_reorder_back */
+  (fn_t)malloc, /* 11: rdft_malloc */
+  (fn_t)calloc, /* 12: rdft_calloc */
+  (fn_t)free, /* 13: rdft_free */
+  (fn_t)flags, /* 14: rdft_flags */
+};
diff --git a/src/pffft32s.c b/src/pffft32s.c
new file mode 100644
index 0000000..7798a45
--- /dev/null
+++ b/src/pffft32s.c
@@ -0,0 +1,34 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 0
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
+
+static void * setup(int len) {void * s = pffft_new_setup(len, PFFFT_REAL); return s;} /* real-data setup */
+static void forward(int length, void * setup, float * h, float * scratch) {(void)length; pffft_transform(setup, h, h, scratch, PFFFT_FORWARD);} /* in place */
+static void oforward(int length, void * setup, float * h, float * scratch) {(void)length; pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD);} /* in place, ordered */
+static void backward(int length, void * setup, float * H, float * scratch) {(void)length; pffft_transform(setup, H, H, scratch, PFFFT_BACKWARD);} /* in place */
+static void obackward(int length, void * setup, float * H, float * scratch) {(void)length; pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);} /* in place, ordered */
+static void convolve(int length, void * setup, float * H, float const * with) {(void)length; pffft_zconvolve(setup, H, with, H);} /* result overwrites H */
+static int multiplier(void) { return 1; }
+static int flags(void) { return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH; } /* both flag bits set */
+
+fn_t _soxr_rdft32s_cb[] = { /* entry n presumably backs the RDFT_CB[n] accessor of rdft_t.h named at right -- confirm */
+  (fn_t)setup, /* 0: rdft_forward_setup */
+  (fn_t)setup, /* 1: rdft_backward_setup */
+  (fn_t)pffft_destroy_setup, /* 2: rdft_delete_setup */
+  (fn_t)forward, /* 3: rdft_forward */
+  (fn_t)oforward, /* 4: rdft_oforward */
+  (fn_t)backward, /* 5: rdft_backward */
+  (fn_t)obackward, /* 6: rdft_obackward */
+  (fn_t)convolve, /* 7: rdft_convolve */
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD, /* 8: rdft_convolve_portion */
+  (fn_t)multiplier, /* 9: rdft_multiplier */
+  (fn_t)pffft_reorder_back, /* 10: rdft_reorder_back */
+  (fn_t)SIMD_ALIGNED_MALLOC, /* 11: rdft_malloc */
+  (fn_t)SIMD_ALIGNED_CALLOC, /* 12: rdft_calloc */
+  (fn_t)SIMD_ALIGNED_FREE, /* 13: rdft_free */
+  (fn_t)flags, /* 14: rdft_flags */
+};
diff --git a/src/pffft64s.c b/src/pffft64s.c
new file mode 100644
index 0000000..7c37c9d
--- /dev/null
+++ b/src/pffft64s.c
@@ -0,0 +1,34 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+#include "pffft-wrap.c"
+
+#include "rdft_t.h"
+
+static void * setup(int len) {void * s = pffft_new_setup(len, PFFFT_REAL); return s;} /* real-data setup */
+static void forward(int length, void * setup, double * h, double * scratch) {(void)length; pffft_transform(setup, h, h, scratch, PFFFT_FORWARD);} /* in place */
+static void oforward(int length, void * setup, double * h, double * scratch) {(void)length; pffft_transform_ordered(setup, h, h, scratch, PFFFT_FORWARD);} /* in place, ordered */
+static void backward(int length, void * setup, double * H, double * scratch) {(void)length; pffft_transform(setup, H, H, scratch, PFFFT_BACKWARD);} /* in place */
+static void obackward(int length, void * setup, double * H, double * scratch) {(void)length; pffft_transform_ordered(setup, H, H, scratch, PFFFT_BACKWARD);} /* in place, ordered */
+static void convolve(int length, void * setup, double * H, double const * with) {(void)length; pffft_zconvolve(setup, H, with, H);} /* result overwrites H */
+static int multiplier(void) { return 1; }
+static int flags(void) { return RDFT_IS_SIMD | RDFT_NEEDS_SCRATCH; } /* both flag bits set */
+
+fn_t _soxr_rdft64s_cb[] = { /* entry n presumably backs the RDFT_CB[n] accessor of rdft_t.h named at right -- confirm */
+  (fn_t)setup, /* 0: rdft_forward_setup */
+  (fn_t)setup, /* 1: rdft_backward_setup */
+  (fn_t)pffft_destroy_setup, /* 2: rdft_delete_setup */
+  (fn_t)forward, /* 3: rdft_forward */
+  (fn_t)oforward, /* 4: rdft_oforward */
+  (fn_t)backward, /* 5: rdft_backward */
+  (fn_t)obackward, /* 6: rdft_obackward */
+  (fn_t)convolve, /* 7: rdft_convolve */
+  (fn_t)ORDERED_PARTIAL_CONVOLVE_SIMD, /* 8: rdft_convolve_portion */
+  (fn_t)multiplier, /* 9: rdft_multiplier */
+  (fn_t)pffft_reorder_back, /* 10: rdft_reorder_back */
+  (fn_t)SIMD_ALIGNED_MALLOC, /* 11: rdft_malloc */
+  (fn_t)SIMD_ALIGNED_CALLOC, /* 12: rdft_calloc */
+  (fn_t)SIMD_ALIGNED_FREE, /* 13: rdft_free */
+  (fn_t)flags, /* 14: rdft_flags */
+};
diff --git a/src/poly-fir.h b/src/poly-fir.h
new file mode 100644
index 0000000..d138e03
--- /dev/null
+++ b/src/poly-fir.h
@@ -0,0 +1,150 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Resample using an interpolated poly-phase FIR with length FIR_LENGTH. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
+
+#if COEF_INTERP != 1 && COEF_INTERP != 2 && COEF_INTERP != 3
+  #error COEF_INTERP
+#endif
+
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+  #define N (FIR_LENGTH>>2) /* FIR_LENGTH/4: each _ step below loads 4 inputs at in+j*4 */
+  /* _ : one accumulation step; the coefficient is a polynomial in X of degree COEF_INTERP, nested as in the scalar branch. */
+  #if COEF_INTERP == 1
+    #define _ sum=vMac(vMac(b,X,a),vLdu(in+j*4),sum), ++j;
+  #elif COEF_INTERP == 2
+    #define _ sum=vMac(vMac(vMac(c,X,b),X,a),vLdu(in+j*4),sum), ++j;
+  #else
+    #define _ sum=vMac(vMac(vMac(vMac(d,X,c),X,b),X,a),vLdu(in+j*4),sum), ++j;
+  #endif
+  /* a..d: polynomial coefficients for the current phase and step j; a is the constant term and sits at the highest index. */
+  #define a coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-0)]
+  #define b coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-1)]
+  #define c coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-2)]
+  #define d coefs[(COEF_INTERP+1)*(N*phase+j)+(COEF_INTERP-3)]
+  /* BEGINNING/END bracket one output sample; CORE gives step counts 2..6 their own literal expansion of core. */
+  #define BEGINNING v4_t X = vLds(x), sum = vZero(); \
+      v4_t const * const __restrict coefs = (v4_t *)COEFS
+  #define END vStorSum(output+i, sum)
+  #define cc(n) case n: core(n); break
+  #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);}
+#else
+  #define N FIR_LENGTH /* scalar build: one input sample per _ step */
+  /* _ : one accumulation step; the coefficient polynomial in x is evaluated in nested (Horner) form. */
+  #if COEF_INTERP == 1
+    #define _ sum += (b*x + a)*in[j], ++j;
+  #elif COEF_INTERP == 2
+    #define _ sum += ((c*x + b)*x + a)*in[j], ++j;
+  #else
+    #define _ sum += (((d*x + c)*x + b)*x + a)*in[j], ++j;
+  #endif
+  /* a..d are fetched through coef(), which is defined elsewhere. */
+  #define a (coef(COEFS, COEF_INTERP, N, phase, 0,j))
+  #define b (coef(COEFS, COEF_INTERP, N, phase, 1,j))
+  #define c (coef(COEFS, COEF_INTERP, N, phase, 2,j))
+  #define d (coef(COEFS, COEF_INTERP, N, phase, 3,j))
+
+  #define BEGINNING sample_t sum = 0
+  #define END output[i] = sum
+  #define CORE(n) core(n)
+#endif
+
+
+
+#define floatPrecCore(n) { /* resampling loop with the read position kept as one floating-point value */ \
+  float_step_t at = p->at.flt; \
+  for (i = 0; (int)at < num_in; ++i, at += p->step.flt) { \
+    sample_t const * const __restrict in = input + (int)at; /* integer part selects the first input sample */ \
+    float_step_t frac = at - (int)at; \
+    int phase = (int)(frac * (1 << PHASE_BITS)); /* coefficient-set index from the fraction */ \
+    sample_t x = (sample_t)(frac * (1 << PHASE_BITS) - phase); /* what is left after removing phase */ \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, (int)at, NULL); /* presumably discards the whole samples stepped over -- confirm */ \
+  p->at.flt = at - (int)at; } /* Could round to 1 in some circumstances. */
+
+
+
+#define highPrecCore(n) { /* resampling loop with a fixed-point position extended by a low-order word (fix.ls) */ \
+  step_t at; at.fix = p->at.fix; \
+  for (i = 0; at.integer < num_in; ++i, \
+      at.fix.ls.all += p->step.fix.ls.all, \
+      at.whole += p->step.whole + (at.fix.ls.all < p->step.fix.ls.all)) { /* comparison adds 1 when the low-word sum wrapped */ \
+    sample_t const * const __restrict in = input + at.integer; \
+    uint32_t frac = at.fraction; \
+    int phase = (int)(frac >> (32 - PHASE_BITS)); /* High-order bits */ \
+    /* Low-order bits, scaled to [0,1): */ \
+    sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, at.integer, NULL); /* presumably discards the whole samples stepped over -- confirm */ \
+  p->at.whole = at.fraction; /* presumably leaves only the fractional part in the saved position -- confirm */ \
+  p->at.fix.ls = at.fix.ls; }
+
+
+
+#define stdPrecCore(n) { /* resampling loop with a 64-bit fixed-point position: parts.ms whole samples, parts.ls fraction */ \
+  int64p_t at; at.all = p->at.whole; \
+  for (i = 0; at.parts.ms < num_in; ++i, at.all += p->step.whole) { \
+    sample_t const * const __restrict in = input + at.parts.ms; \
+    uint32_t const frac = at.parts.ls; \
+    int phase = (int)(frac >> (32 - PHASE_BITS)); /* high-order bits */ \
+    /* Low-order bits, scaled to [0,1): */ \
+    sample_t x = (sample_t)((frac << PHASE_BITS) * (1 / MULT32)); \
+    int j = 0; \
+    BEGINNING; CONVOLVE(n); END; \
+  } \
+  fifo_read(&p->fifo, at.parts.ms, NULL); /* presumably discards the whole samples stepped over -- confirm */ \
+  p->at.whole = at.parts.ls; } /* only the fraction is carried to the next call */
+
+
+
+#if WITH_FLOAT_STD_PREC_CLOCK
+  #define SPCORE floatPrecCore /* standard-precision loop: floating-point position */
+#else
+  #define SPCORE stdPrecCore /* standard-precision loop: fixed-point position */
+#endif
+
+
+
+#if WITH_HI_PREC_CLOCK
+  #define core(n) if (p->use_hi_prec_clock) highPrecCore(n) else SPCORE(n) /* the loops expand to brace blocks, so no ';' precedes else */
+#else
+  #define core(n) SPCORE(n)
+#endif
+
+
+
+static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+{ /* Template body: reserves the maximum output, runs the selected core loop, then trims to what was produced. */
+  sample_t const * input = stage_read_p(p);
+  int num_in = min(stage_occupancy(p), p->input_size);
+  int i, max_num_out = 1 + (int)(num_in * p->out_in_ratio);
+  sample_t * const __restrict output = fifo_reserve(output_fifo, max_num_out);
+  /* The core macros use p, input, num_in, output and i by name, and leave the output count in i. */
+  CORE(N);
+  assert(max_num_out - i >= 0);
+  fifo_trim_by(output_fifo, max_num_out - i); /* hand back the reserved-but-unused tail */
+}
+
+
+
+#undef _
+#undef a
+#undef b
+#undef c
+#undef d
+#undef CORE
+#undef cc
+#undef core
+#undef COEF_INTERP
+#undef N
+#undef BEGINNING
+#undef END
+#undef CONVOLVE
+#undef FIR_LENGTH
+#undef FUNCTION
+#undef PHASE_BITS
diff --git a/src/poly-fir0.h b/src/poly-fir0.h
new file mode 100644
index 0000000..76fca2d
--- /dev/null
+++ b/src/poly-fir0.h
@@ -0,0 +1,56 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Resample using a non-interpolated poly-phase FIR with length FIR_LENGTH. */
+/* Input must be followed by FIR_LENGTH-1 samples. */
+
+#if SIMD_AVX || SIMD_SSE || SIMD_NEON
+  #define N (FIR_LENGTH>>2) /* FIR_LENGTH/4: each _ step loads 4 inputs at at+j*4 */
+  #define BEGINNING v4_t sum = vZero(); /* coefs: the coefficient set for phase rem */ \
+      v4_t const * const __restrict coefs = (v4_t *)COEFS + N * rem;
+  #define _ sum = vMac(vLdu(at+j*4), coefs[j], sum), ++j;
+  #define END vStorSum(output+i, sum)
+  #define cc(n) case n: core(n); break
+  #define CORE(n) switch (n) {cc(2); cc(3); cc(4); cc(5); cc(6); default: core(n);} /* counts 2..6 get a literal expansion */
+#else
+  #define N FIR_LENGTH /* scalar build: one input sample per _ step */
+  #define BEGINNING sample_t sum = 0; /* coefs: the coefficient set for phase rem */ \
+      sample_t const * const __restrict coefs = (sample_t *)COEFS + N * rem;
+  #define _ sum += coefs[j]*at[j], ++j;
+  #define END output[i] = sum
+  #define CORE(n) core(n)
+#endif
+
+#define core(n) /* one output per iteration; the integer position at advances by step in units of 1/L input sample */ \
+  for (i = 0; at < num_in * p->L; ++i, at += step) { \
+    int const div = at / p->L, rem = at % p->L; /* input index and phase */ \
+    sample_t const * const __restrict at = input + div; /* shadows the integer at within the loop body */ \
+    int j = 0; BEGINNING; CONVOLVE(n); END;}
+
+static void FUNCTION(stage_t * p, fifo_t * output_fifo)
+{ /* Template body: computes the exact output count up front, runs the core loop, then saves the leftover phase. */
+  int num_in = min(stage_occupancy(p), p->input_size);
+  if (num_in) {
+    sample_t const * input = stage_read_p(p);
+    int at = p->at.integer, step = p->step.integer;
+    int i, num_out = (num_in * p->L - at + step - 1) / step; /* rounded-up count of positions below num_in * L */
+    sample_t * __restrict output = fifo_reserve(output_fifo, num_out);
+
+    CORE(N); /* uses p, input, num_in, at, step, output and i by name */
+    assert(i == num_out);
+    fifo_read(&p->fifo, at / p->L, NULL); /* presumably discards the whole input samples stepped over -- confirm */
+    p->at.integer = at % p->L; /* carry the remaining phase to the next call */
+  }
+}
+
+#undef _
+#undef CORE
+#undef cc
+#undef core
+#undef N
+#undef BEGINNING
+#undef MIDDLE
+#undef END
+#undef CONVOLVE
+#undef FIR_LENGTH
+#undef FUNCTION
diff --git a/src/rdft.h b/src/rdft.h
new file mode 100644
index 0000000..59ba174
--- /dev/null
+++ b/src/rdft.h
@@ -0,0 +1,31 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+void ORDERED_CONVOLVE(int n, void * not_used, DFT_FLOAT * a, const DFT_FLOAT * b)
+{ /* In place a *= b: values 0 and 1 as plain reals, values 2..n-1 as (re, im) pairs. */
+  int k;
+  (void)not_used;
+  a[0] *= b[0];
+  a[1] *= b[1];
+  for (k = 2; k < n; k += 2) {
+    DFT_FLOAT const re = a[k], im = a[k+1]; /* operands read before a[k] is overwritten */
+    a[k  ] = b[k  ] * re - b[k+1] * im;
+    a[k+1] = b[k+1] * re + b[k  ] * im;
+  }
+}
+
+void ORDERED_PARTIAL_CONVOLVE(int n, DFT_FLOAT * a, const DFT_FLOAT * b)
+{ /* In place complex multiply of pairs 2..n-1; a[1] receives the real part of the next pair's product. */
+  int k = 2;
+  a[0] *= b[0];
+  for (; k < n; k += 2) {
+    DFT_FLOAT const re = a[k], im = a[k+1]; /* operands read before a[k] is overwritten */
+    a[k  ] = b[k  ] * re - b[k+1] * im;
+    a[k+1] = b[k+1] * re + b[k  ] * im;
+  }
+  a[1] = b[k] * a[k] - b[k+1] * a[k+1]; /* k is the first even index >= n (or 2) at this point */
+}
+
+#undef ORDERED_CONVOLVE
+#undef ORDERED_PARTIAL_CONVOLVE
+#undef DFT_FLOAT
diff --git a/src/rdft_t.h b/src/rdft_t.h
new file mode 100644
index 0000000..293d9c3
--- /dev/null
+++ b/src/rdft_t.h
@@ -0,0 +1,24 @@
+/* SoX Resampler Library      Copyright (c) 2007-13 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+typedef void (* fn_t)(void); /* generic function-pointer type used for every table slot */
+/* Accessors: each casts slot n of the RDFT_CB table (defined elsewhere) to the signature it is called with. */
+#define rdft_forward_setup    (*(void * (*)(int))RDFT_CB[0])
+#define rdft_backward_setup   (*(void * (*)(int))RDFT_CB[1])
+#define rdft_delete_setup     (*(void (*)(void *))RDFT_CB[2])
+#define rdft_forward          (*(void (*)(int, void *, void *, void *))RDFT_CB[3])
+#define rdft_oforward         (*(void (*)(int, void *, void *, void *))RDFT_CB[4])
+#define rdft_backward         (*(void (*)(int, void *, void *, void *))RDFT_CB[5])
+#define rdft_obackward        (*(void (*)(int, void *, void *, void *))RDFT_CB[6])
+#define rdft_convolve         (*(void (*)(int, void *, void *, void const *))RDFT_CB[7])
+#define rdft_convolve_portion (*(void (*)(int, void *, void const *))RDFT_CB[8])
+#define rdft_multiplier       (*(int (*)(void))RDFT_CB[9])
+#define rdft_reorder_back     (*(void (*)(int, void *, void *, void *))RDFT_CB[10])
+#define rdft_malloc           (*(void * (*)(size_t))RDFT_CB[11])
+#define rdft_calloc           (*(void * (*)(size_t, size_t))RDFT_CB[12])
+#define rdft_free             (*(void (*)(void *))RDFT_CB[13])
+#define rdft_flags            (*(int (*)(void))RDFT_CB[14])
+
+/* Flag templates (bit values, combined with | by the flags callbacks): */
+#define RDFT_IS_SIMD       1
+#define RDFT_NEEDS_SCRATCH 2
diff --git a/src/rint-clip.h b/src/rint-clip.h
new file mode 100644
index 0000000..bfb6458
--- /dev/null
+++ b/src/rint-clip.h
@@ -0,0 +1,158 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if defined DITHER
+/* Dithered variant: every helper below threads a random seed through the conversion routines. */
+#define DITHERING + (1./32)*(int)(((ran1>>=3)&31)-((ran2>>=3)&31)) /* difference of two 5-bit fields, scaled by 1/32 */
+#define DITHER_RAND (seed = 1664525UL * seed + 1013904223UL) >> 3 /* linear congruential step on seed */
+#define DITHER_VARS unsigned long ran1 = DITHER_RAND, ran2 = DITHER_RAND
+#define SEED_ARG , unsigned long * seed0
+#define SAVE_SEED *seed0 = seed
+#define COPY_SEED unsigned long seed = *seed0;
+#define COPY_SEED1 unsigned long seed1 = seed
+#define PASS_SEED1 , &seed1
+#define PASS_SEED  , &seed
+#define FLOATD double
+
+#else
+/* Undithered variant: the seed helpers expand to nothing. */
+#define DITHERING
+#define DITHER_VARS
+#define SEED_ARG
+#define SAVE_SEED
+#define COPY_SEED
+#define COPY_SEED1
+#define PASS_SEED1
+#define PASS_SEED
+#define FLOATD FLOATX
+
+#endif
+
+#define DO_16 _;_;_;_;_;_;_;_;_;_;_;_;_;_;_;_
+
+
+
+#if defined FE_INVALID && defined FPU_RINT
+static void RINT_CLIP(RINT_T * const dest, FLOATX const * const src,
+    unsigned stride, size_t i, size_t const n, size_t * const clips SEED_ARG)
+{ /* Slow path: converts src[i..n) one sample at a time, checking the FPU invalid flag after each store. */
+  COPY_SEED
+  DITHER_VARS;
+  for (; i < n; ++i) {
+    FLOATD const d = src[i] DITHERING;
+    RINT(dest[stride * i], d);
+    if (fe_test_invalid()) { /* the conversion raised "invalid": treat the sample as out of range */
+      fe_clear_invalid();
+      dest[stride * i] = d > 0? RINT_MAX : -RINT_MAX - 1; /* saturate towards the sign of d */
+      ++*clips;
+    }
+  }
+  SAVE_SEED;
+}
+#endif
+
+
+
+static size_t LSX_RINT_CLIP(void * * const dest0, FLOATX const * const src,
+    size_t const n SEED_ARG)
+{ /* Rounds src[0..n) into the RINT_T buffer at *dest0, advances *dest0 by n, returns the number of clipped samples. */
+  size_t i, clips = 0;
+  RINT_T * dest = *dest0;
+  COPY_SEED
+#if defined FE_INVALID && defined FPU_RINT
+#define _ RINT(dest[i], src[i] DITHERING); ++i
+  for (i = 0; i < (n & ~(size_t)15);) { /* size_t-wide mask: ~15u would clear the upper bits of a 64-bit n */
+    COPY_SEED1; /* seed as it was before this group, so a redo uses the same dither */
+    DITHER_VARS;
+    DO_16;
+    if (fe_test_invalid()) { /* some sample in the group overflowed: redo the 16 with per-sample checks */
+      fe_clear_invalid();
+      RINT_CLIP(dest, src, 1, i - 16, i, &clips PASS_SEED1);
+    }
+  }
+  RINT_CLIP(dest, src, 1, i, n, &clips PASS_SEED); /* remaining n % 16 samples */
+#else
+#define _ d = src[i] DITHERING, dest[i++] = (RINT_T)(d > 0? \
+    d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5)
+  const double N = 1. + RINT_MAX;
+  double d;
+  for (i = 0; i < (n & ~(size_t)15);) { /* size_t-wide mask: ~15u would clear the upper bits of a 64-bit n */
+    DITHER_VARS;
+    DO_16;
+  }
+  {
+    DITHER_VARS;
+    for (; i < n; _); /* remaining n % 16 samples */
+  }
+#endif
+  SAVE_SEED;
+  *dest0 = dest + n;
+  return clips;
+}
+#undef _
+
+
+
+static size_t LSX_RINT_CLIP_2(void * * dest0, FLOATX const * const * srcs,
+    unsigned const stride, size_t const n SEED_ARG)
+{ /* Rounds n samples from each of the stride source arrays srcs[j] into one interleaved buffer; returns the clip count. */
+  unsigned j;
+  size_t i, clips = 0;
+  RINT_T * dest = *dest0;
+  COPY_SEED
+#if defined FE_INVALID && defined FPU_RINT
+#define _ RINT(dest[stride * i], src[i] DITHERING); ++i
+  for (j = 0; j < stride; ++j, ++dest) { /* dest moves one slot per source array */
+    FLOATX const * const src = srcs[j];
+    for (i = 0; i < (n & ~(size_t)15);) { /* size_t-wide mask: ~15u would clear the upper bits of a 64-bit n */
+      COPY_SEED1; /* seed as it was before this group, so a redo uses the same dither */
+      DITHER_VARS;
+      DO_16;
+      if (fe_test_invalid()) { /* some sample in the group overflowed: redo the 16 with per-sample checks */
+        fe_clear_invalid();
+        RINT_CLIP(dest, src, stride, i - 16, i, &clips PASS_SEED1);
+      }
+    }
+    RINT_CLIP(dest, src, stride, i, n, &clips PASS_SEED); /* remaining n % 16 samples */
+  }
+#else
+#define _ d = src[i] DITHERING, dest[stride * i++] = (RINT_T)(d > 0? \
+    d+.5 >= N? ++clips, N-1 : d+.5 : d-.5 <= -N-1? ++clips, -N:d-.5)
+  const double N = 1. + RINT_MAX;
+  double d;
+  for (j = 0; j < stride; ++j, ++dest) { /* dest moves one slot per source array */
+    FLOATX const * const src = srcs[j];
+    for (i = 0; i < (n & ~(size_t)15);) { /* size_t-wide mask: ~15u would clear the upper bits of a 64-bit n */
+      DITHER_VARS;
+      DO_16;
+    }
+    {
+      DITHER_VARS;
+      for (; i < n; _); /* remaining n % 16 samples */
+    }
+  }
+#endif
+  SAVE_SEED;
+  *dest0 = dest + stride * (n - 1); /* dest already advanced by stride above, so this is start + stride * n */
+  return clips;
+}
+#undef _
+
+#undef FLOATD
+#undef PASS_SEED
+#undef PASS_SEED1
+#undef COPY_SEED1
+#undef COPY_SEED
+#undef SAVE_SEED
+#undef SEED_ARG
+#undef DITHER_VARS
+#undef DITHERING
+#undef DITHER
+
+#undef RINT_MAX
+#undef RINT_T
+#undef FPU_RINT
+#undef RINT
+#undef RINT_CLIP
+#undef LSX_RINT_CLIP
+#undef LSX_RINT_CLIP_2
diff --git a/src/rint.h b/src/rint.h
new file mode 100644
index 0000000..2f1dfbe
--- /dev/null
+++ b/src/rint.h
@@ -0,0 +1,102 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_rint_included
+#define soxr_rint_included
+
+#include "std-types.h"
+
+/* For x86, compiler-supplied versions of these functions (where available)
+ * can have poor performance (e.g. mingw32), so prefer these asm versions: */
+
+#if defined __GNUC__ && (defined __i386__ || defined __x86_64__)
+  #define FPU_RINT32
+  #define FPU_RINT16
+  #define rint32D(a,b) __asm__ __volatile__("fistpl %0": "=m"(a): "t"(b): "st")
+  #define rint16D(a,b) __asm__ __volatile__("fistps %0": "=m"(a): "t"(b): "st")
+  #define rint32F rint32D
+  #define rint16F rint16D
+  #define FE_INVALID 1
+  static __inline int fe_test_invalid(void) {
+    int status_word; /* Receives the status word stored by fnstsw. */
+    __asm__ __volatile__("fnstsw %%ax": "=a"(status_word));
+    return status_word & FE_INVALID; /* Non-zero iff that bit is set. */
+  }
+  static __inline int fe_clear_invalid(void) {
+    int32_t status[7]; /* Buffer for the environment stored by fnstenv. */
+    __asm__ __volatile__("fnstenv %0": "=m"(status));
+    status[1] &= ~FE_INVALID; /* Clear the bit in word 1 of the saved copy, */
+    __asm__ __volatile__("fldenv %0": : "m"(*status)); /* then reload it. */
+    return 0;
+  }
+#elif defined _MSC_VER && defined _M_IX86
+  #define FPU_RINT32
+  #define FPU_RINT16
+  #define rint_fn(N,Y,X) \
+    static __inline void N(Y *y, X x) {Y t; {__asm fld x __asm fistp t} *y=t;}
+  rint_fn(rint32d, int32_t, double)
+  rint_fn(rint32f, int32_t, float )
+  rint_fn(rint16d, int16_t, double)
+  rint_fn(rint16f, int16_t, float )
+  #define rint32D(y,x) rint32d(&(y),x)
+  #define rint32F(y,x) rint32f(&(y),x)
+  #define rint16D(y,x) rint16d(&(y),x)
+  #define rint16F(y,x) rint16f(&(y),x)
+  #define FE_INVALID 1
+  static __inline int fe_test_invalid(void) {
+    short status_word; /* Receives the status word stored by fnstsw. */
+    __asm fnstsw status_word
+    return status_word & FE_INVALID; /* Non-zero iff that bit is set. */
+  }
+  static __inline int fe_clear_invalid(void) {
+    int32_t status[7]; /* Buffer for the environment stored by fnstenv. */
+    __asm fnstenv status
+    status[1] &= ~FE_INVALID; /* Clear the bit in word 1; fldenv reloads it. */
+    __asm fldenv status
+    return 0;
+  }
+#elif defined _MSC_VER && defined _M_X64
+  #include <emmintrin.h>
+  #include <float.h>
+  #define FPU_RINT32
+  #define FPU_RINT16
+  static __inline void rint32d(int32_t *y, double x) {
+    *y = _mm_cvtsd_si32(_mm_load_sd(&x));}
+  static __inline void rint32f(int32_t *y, float  x) {
+    *y = _mm_cvtss_si32(_mm_load_ss(&x));}
+  static __inline void rint16d(int16_t *y, double x) { /* Scale by 2^16, add half (32768), keep the high 16 bits. */
+    x = x*65536+32768; *y = (int16_t)(_mm_cvtsd_si32(_mm_load_sd(&x)) >> 16);}
+  #define rint32D(y,x) rint32d(&(y),x)
+  #define rint32F(y,x) rint32f(&(y),x)
+  #define rint16D(y,x) rint16d(&(y),x)
+  #define rint16F(y,x) rint16d(&(y),(double)(x))
+  #define FE_INVALID 1
+  #define fe_test_invalid() (_statusfp() & _SW_INVALID)
+  #define fe_clear_invalid _clearfp /* Note: clears all. */
+#elif HAVE_LRINT && LONG_MAX == 2147483647L && HAVE_FENV_H
+  #include <math.h>
+  #include <fenv.h>
+  #define FPU_RINT32
+  #define rint32D(y,x) ((y)=lrint(x))
+  #define rint32F(y,x) ((y)=lrintf(x))
+  #define fe_test_invalid() fetestexcept(FE_INVALID)
+  #define fe_clear_invalid() feclearexcept(FE_INVALID)
+#endif
+/* Portable fallbacks: cast of (x -/+ .5); note that x is evaluated twice. */
+#if !defined FPU_RINT32
+  #define rint32D(y,x) ((y)=(int32_t)((x) < 0? (x) - .5 : (x) + .5))
+  #define rint32F(y,x) rint32D(y,(double)(x))
+#endif
+
+#if !defined FPU_RINT16
+  #define rint16D(y,x) ((y)=(int16_t)((x) < 0? (x) - .5 : (x) + .5))
+  #define rint16F(y,x) rint16D(y,(double)(x))
+#endif
+/* Function form of rint32D: converts `input' and returns the int32_t result. */
+static __inline int32_t rint32(double input) {
+  int32_t result; rint32D(result, input); return result;}
+/* Function form of rint16D: converts `input' and returns the int16_t result. */
+static __inline int16_t rint16(double input) {
+  int16_t result; rint16D(result, input); return result;}
+
+#endif
diff --git a/src/samplerate.h b/src/samplerate.h
new file mode 100644
index 0000000..911cc5d
--- /dev/null
+++ b/src/samplerate.h
@@ -0,0 +1 @@
+#include "soxr-lsr.h"
diff --git a/src/soxr-lsr.c b/src/soxr-lsr.c
new file mode 100644
index 0000000..58ab50a
--- /dev/null
+++ b/src/soxr-lsr.c
@@ -0,0 +1,198 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Wrapper mostly compatible with `libsamplerate'. */
+
+#include <assert.h>
+#include <stdlib.h>
+#include "soxr.h"
+#include "soxr-lsr.h"
+#include "rint.h"
+
+
+/* Creates a converter via src_callback_new with no callback and no state. */
+SRC_STATE *src_new(SRC_SRCTYPE id, int channels, SRC_ERROR * error)
+{
+  return src_callback_new(0, id, channels, error, 0);
+}
+
+/* Runs one libsamplerate-style processing step described by *io.
+ * Returns 0 on success, -1 on a null argument or a recorded soxr error. */
+SRC_ERROR src_process(SRC_STATE *p, SRC_DATA * io)
+{
+  size_t idone , odone;
+
+  if (!p || !io) return -1;
+
+  soxr_set_error(
+      p, soxr_set_io_ratio(p, 1/io->src_ratio, (size_t)io->output_frames));
+  /* End of input is signalled by passing the complemented input length: */
+  soxr_process(p, io->data_in,                                  /* hack: */
+      (size_t)(io->end_of_input? ~io->input_frames : io->input_frames),
+      &idone, io->data_out, (size_t)io->output_frames, &odone);
+
+  io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone;
+  return -!!soxr_error(p);
+}
+
+
+/* Sets the output/input ratio immediately; returns 0 on success, else -1. */
+SRC_ERROR src_set_ratio(SRC_STATE * p, double oi_ratio)
+{
+  return soxr_set_io_ratio(p, 1/oi_ratio, 0)? -1 : 0;
+}
+
+
+/* Resets the converter through soxr_clear; returns 0 on success, else -1. */
+SRC_ERROR src_reset(SRC_STATE * p)
+{
+  return soxr_clear(p)? -1 : 0;
+}
+
+
+/* Returns -1 if the converter has a recorded error, otherwise 0. */
+SRC_ERROR src_error(SRC_STATE * p)
+{
+  return soxr_error(p)? -1 : 0;
+}
+
+
+/* Destroys the converter via soxr_delete; always returns a null pointer. */
+SRC_STATE * src_delete(SRC_STATE * p)
+{
+  soxr_delete(p);
+  return 0;
+}
+
+/* Creates a converter whose input is pulled from `fn' (called with state p).
+ * num_threads is 0 if SOXR_LSR_NUM_THREADS is set and not 1, otherwise 1. */
+SRC_STATE *src_callback_new(src_callback_t fn,
+    SRC_SRCTYPE id, int channels, SRC_ERROR * error0, void * p)
+{
+  soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0);
+  char const * e = getenv("SOXR_LSR_NUM_THREADS");
+  soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
+  soxr_error_t error;
+  soxr_t soxr = 0;
+
+  assert (channels > 0);
+  soxr = soxr_create(0, 0, (unsigned)channels, &error, 0, &q_spec, &r_spec);
+
+  if (soxr)
+    error = soxr_set_input_fn(soxr, (soxr_input_fn_t)fn, p, 0);
+
+  if (error0)
+    *error0 = -!!error;
+
+  return soxr;
+}
+
+/* Pulls up to olen output frames into obuf at the given output/input ratio.
+ * Returns the count from soxr_output, or -1 if p is null or olen negative. */
+long src_callback_read(SRC_STATE *p, double oi_ratio, long olen, float * obuf)
+{
+  if (!p || olen < 0) return -1;
+
+  soxr_set_error(p, soxr_set_io_ratio(p, 1/oi_ratio, (size_t)olen));
+  return (long)soxr_output(p, obuf, (size_t)olen);
+}
+
+/* One-shot conversion of *io through soxr_oneshot (input rate 1, output rate
+ * io->src_ratio).  Returns -1 on invalid arguments or on error, else 0. */
+SRC_ERROR src_simple(SRC_DATA * io, SRC_SRCTYPE id, int channels)
+{
+  size_t idone, odone;
+  soxr_error_t error;
+  soxr_quality_spec_t q_spec = soxr_quality_spec(SOXR_LSR0Q + (unsigned)id, 0);
+  char const * e = getenv("SOXR_LSR_NUM_THREADS");
+  soxr_runtime_spec_t r_spec = soxr_runtime_spec(!(e && atoi(e) != 1));
+
+  if (!io || channels<=0 || io->input_frames<0 || io->output_frames<0) return-1;
+
+  error = soxr_oneshot(1, io->src_ratio, (unsigned)channels, io->data_in,
+      (size_t)io->input_frames, &idone, io->data_out, (size_t)io->output_frames,
+      &odone, 0, &q_spec, &r_spec);
+
+  io->input_frames_used = (long)idone, io->output_frames_gen = (long)odone;
+
+  return -!!error;
+}
+
+/* Returns the name for converter `id', or 0 if it is out of range.  The sixth
+ * entry ("SoX VHQ") is reachable only when SOXR_LSR_STRICT is unset. */
+char const * src_get_name(SRC_SRCTYPE id)
+{
+  static char const * const names[] = {
+    "LSR best sinc", "LSR medium sinc", "LSR fastest sinc",
+    "LSR ZOH", "LSR linear", "SoX VHQ"};
+
+  return (unsigned)id < 5u + !getenv("SOXR_LSR_STRICT")? names[id] : 0;
+}
+
+
+/* Returns the same string as src_get_name for the given converter id. */
+char const * src_get_description(SRC_SRCTYPE id)
+{
+  return src_get_name(id);
+}
+
+
+/* Returns the version string reported by soxr_version. */
+char const * src_get_version(void)
+{
+  return soxr_version();
+}
+
+
+/* Text for an SRC_ERROR: 1 and other non-zero codes get fixed strings. */
+char const * src_strerror(SRC_ERROR error)
+{
+  return error == 1? "Placeholder." : error ? "soxr error" : soxr_strerror(0);
+}
+
+
+/* With SOXR_LSR_STRICT set, accepts [1/256, 256]; otherwise any ratio > 0. */
+int src_is_valid_ratio(double oi_ratio)
+{
+  if (!getenv("SOXR_LSR_STRICT")) return oi_ratio > 0;
+  return oi_ratio >= 1./256 && oi_ratio <= 256;
+}
+
+/* Converts len shorts to floats, scaling by 1 / (SHRT_MAX + 1).
+ * A non-positive len converts nothing (a negative one used to overrun). */
+void src_short_to_float_array(short const * src, float * dest, int len)
+{
+  assert (src && dest);
+
+  while (len-- > 0) dest[len] = (float)(src[len] * (1 / (1. + SHRT_MAX)));
+}
+
+/* Converts len floats to shorts: scales by SHRT_MAX + 1, clips to the short
+ * range, else rounds with rint16.  A non-positive len converts nothing. */
+void src_float_to_short_array(float const * src, short * dest, int len)
+{
+  double d, N = 1. + SHRT_MAX;
+  assert (src && dest);
+
+  while (len-- > 0) d = src[len] * N, dest[len] =
+    (short)(d > N - 1? (short)(N - 1) : d < -N? (short)-N : rint16(d));
+}
+
+
+/* Converts len ints to floats (scale 2^-31); len <= 0 converts nothing. */
+void src_int_to_float_array(int const * src, float * dest, int len)
+{
+  assert (src && dest);
+  while (len-- > 0) dest[len] = (float)(src[len] * (1 / (32768. * 65536.)));
+}
+
+/* Converts len floats to ints: scales by 2^31, clips to the int32 range, else
+ * rounds with rint32.  A non-positive len converts nothing. */
+void src_float_to_int_array(float const * src, int * dest, int len)
+{
+  double d, N = 32768. * 65536.; /* N.B. int32, not int! (Also above fn.) */
+  assert (src && dest);
+
+  while (len-- > 0) d = src[len] * N, dest[len] =
+    d >= N - 1? (int)(N - 1) : d < -N? (int)(-N) : rint32(d);
+}
diff --git a/src/soxr-lsr.h b/src/soxr-lsr.h
new file mode 100644
index 0000000..b1cc247
--- /dev/null
+++ b/src/soxr-lsr.h
@@ -0,0 +1,78 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+/* Wrapper compatible with `libsamplerate' (constant-rate).
+ * (Libsoxr's native API can be found in soxr.h).  */
+
+#if !defined SAMPLERATE_H
+#define SAMPLERATE_H
+#if defined __cplusplus
+  extern "C" {
+#endif
+
+#if defined SOXR_DLL
+  #if defined soxr_lsr_EXPORTS
+    #define SOXR __declspec(dllexport)
+  #else
+    #define SOXR __declspec(dllimport)
+  #endif
+#elif defined SOXR_VISIBILITY && defined __GNUC__ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+  #define SOXR __attribute__ ((visibility("default")))
+#else
+  #define SOXR
+#endif
+
+typedef float   SRC_SAMPLE;
+enum SRC_SRCTYPE_e {SRC_SINC_BEST_QUALITY, SRC_SINC_MEDIUM_QUALITY,
+                    SRC_SINC_FASTEST, SRC_ZERO_ORDER_HOLD, SRC_LINEAR};
+typedef int     SRC_SRCTYPE;
+typedef int     SRC_ERROR;
+typedef long    (* src_callback_t)(void *, SRC_SAMPLE * *);
+typedef struct  soxr SRC_STATE;
+typedef struct  SRC_DATA {
+  SRC_SAMPLE    * data_in, * data_out; /* Input and output sample buffers. */
+  long          input_frames, output_frames; /* Lengths offered for in/out. */
+  long          input_frames_used, output_frames_gen; /* Written on return. */
+  int           end_of_input; /* Non-zero: request a flush after this input. */
+  double        src_ratio; /* Output rate divided by input rate. */
+} SRC_DATA;
+SOXR SRC_STATE *   src_new(SRC_SRCTYPE, int num_channels, SRC_ERROR *);
+SOXR SRC_ERROR     src_process  (SRC_STATE *, SRC_DATA *);
+SOXR SRC_ERROR     src_set_ratio(SRC_STATE *, double);
+SOXR SRC_ERROR     src_reset    (SRC_STATE *);
+SOXR SRC_ERROR     src_error    (SRC_STATE *);
+SOXR SRC_STATE *   src_delete   (SRC_STATE *);
+SOXR SRC_STATE *   src_callback_new(
+                    src_callback_t, SRC_SRCTYPE, int, SRC_ERROR *, void *);
+SOXR long          src_callback_read(
+                    SRC_STATE *, double src_ratio, long, SRC_SAMPLE *);
+SOXR SRC_ERROR     src_simple(SRC_DATA *, SRC_SRCTYPE, int);
+SOXR char const *  src_get_name(SRC_SRCTYPE);
+SOXR char const *  src_get_description(SRC_SRCTYPE);
+SOXR char const *  src_get_version(void);
+SOXR char const *  src_strerror(SRC_ERROR);
+SOXR int           src_is_valid_ratio(double);
+SOXR void          src_short_to_float_array(short const *, float *, int);
+SOXR void          src_float_to_short_array(float const *, short *, int);
+SOXR void          src_int_to_float_array(int const *, float *, int);
+SOXR void          src_float_to_int_array(float const *, int *, int);
+
+#undef SOXR
+#if defined __cplusplus
+  }
+#endif
+#endif
diff --git a/src/soxr-lsr.pc.in b/src/soxr-lsr.pc.in
new file mode 100644
index 0000000..7b75757
--- /dev/null
+++ b/src/soxr-lsr.pc.in
@@ -0,0 +1,5 @@
+Name: ${LSR}
+Description: ${DESCRIPTION_SUMMARY} (with libsamplerate-like bindings)
+Version: ${PROJECT_VERSION}
+Libs: -L${LIB_INSTALL_DIR} -l${LSR}
+Cflags: -I${INCLUDE_INSTALL_DIR}
diff --git a/src/soxr.c b/src/soxr.c
new file mode 100644
index 0000000..c2861ac
--- /dev/null
+++ b/src/soxr.c
@@ -0,0 +1,842 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <math.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#include "soxr.h"
+#include "data-io.h"
+#include "internal.h"
+
+#if AVUTIL_FOUND
+  #include <libavutil/cpu.h>
+#endif
+
+
+
+#if WITH_DEV_TRACE
+
+#include <stdarg.h>
+#include <stdio.h>
+
+int _soxr_trace_level;
+/* Writes a printf-style message, followed by a newline, to stderr. */
+void _soxr_trace(char const * fmt, ...)
+{
+  va_list args;
+  va_start(args, fmt);
+  vfprintf(stderr, fmt, args);
+  fputc('\n', stderr);
+  va_end(args);
+}
+
+#endif
+
+
+/* Returns the string "libsoxr-" followed by SOXR_THIS_VERSION_STR. */
+char const * soxr_version(void)
+{
+  return "libsoxr-" SOXR_THIS_VERSION_STR;
+}
+
+
+
+
+typedef void sample_t; /* float or double */
+typedef void (* fn_t)(void);
+typedef fn_t control_block_t[10];
+/* Typed accessors for the ten control_block slots; need a soxr_t `p' in scope. */
+#define resampler_input        (*(sample_t * (*)(void *, sample_t * samples, size_t   n))p->control_block[0])
+#define resampler_process      (*(void (*)(void *, size_t))p->control_block[1])
+#define resampler_output       (*(sample_t const * (*)(void *, sample_t * samples, size_t * n))p->control_block[2])
+#define resampler_flush        (*(void (*)(void *))p->control_block[3])
+#define resampler_close        (*(void (*)(void *))p->control_block[4])
+#define resampler_delay        (*(double (*)(void *))p->control_block[5])
+#define resampler_sizes        (*(void (*)(size_t * shared, size_t * channel))p->control_block[6])
+#define resampler_create       (*(char const * (*)(void * channel, void * shared, double io_ratio, soxr_quality_spec_t * q_spec, soxr_runtime_spec_t * r_spec, double scale))p->control_block[7])
+#define resampler_set_io_ratio (*(void (*)(void *, double io_ratio, size_t len))p->control_block[8])
+#define resampler_id           (*(char const * (*)(void))p->control_block[9])
+
+typedef void * resampler_t; /* For one channel. */
+typedef void * resampler_shared_t; /* Between channels. */
+typedef void (* deinterleave_t)(sample_t * * dest,
+    soxr_datatype_t data_type, void const * * src0, size_t n, unsigned ch);
+typedef size_t (* interleave_t)(soxr_datatype_t data_type, void * * dest,
+    sample_t const * const * src, size_t, unsigned, unsigned long *);
+
+struct soxr {
+  unsigned num_channels;
+  double io_ratio; /* Input rate divided by output rate. */
+  soxr_error_t error; /* Once set, most entry points return early. */
+  soxr_quality_spec_t q_spec;
+  soxr_io_spec_t io_spec;
+  soxr_runtime_spec_t runtime_spec;
+
+  void * input_fn_state; /* Passed as the first argument to input_fn. */
+  soxr_input_fn_t input_fn; /* Optional pull-style input callback. */
+  size_t max_ilen; /* Upper bound on the length requested from input_fn. */
+
+  resampler_shared_t shared; /* State shared between channels. */
+  resampler_t * resamplers; /* One entry per channel. */
+  control_block_t control_block; /* Entry points of the selected engine. */
+  deinterleave_t deinterleave;
+  interleave_t interleave;
+
+  void * * channel_ptrs; /* Per-channel buffer pointers used during I/O. */
+  size_t clips; /* Running total of the counts returned by interleave. */
+  unsigned long seed; /* Passed to interleave unless SOXR_NO_DITHER is set. */
+  int flushing; /* Set when a zero-length input marks the end of input. */
+};
+
+
+
+#if WITH_CR32 || WITH_CR32S || WITH_CR64 || WITH_CR64S
+  #include "filter.h"
+#else
+  #define lsx_to_3dB(x) ((x)/(x))
+#endif
+
+/* Builds a quality spec: recipe bits 0-3 pick the quality, bits 4-5 the phase
+ * response; flags seed spec.flags.  An invalid quality sets only spec.e. */
+soxr_quality_spec_t soxr_quality_spec(unsigned long recipe, unsigned long flags)
+{
+  soxr_quality_spec_t spec, * p = &spec;
+  unsigned q = recipe & 0xf;                         /* TODO: move to soxr-lsr.c: */
+  unsigned quality = q > SOXR_LSR2Q+2? SOXR_VHQ : q > SOXR_LSR2Q? SOXR_QQ : q;
+  double rej;
+  memset(p, 0, sizeof(*p));
+  if (quality > SOXR_PRECISIONQ) {
+    p->e = "invalid quality type";
+    return spec;
+  }
+  flags |= quality < SOXR_LSR0Q ? RESET_ON_CLEAR : 0;
+  p->phase_response = "\62\31\144"[(recipe & 0x30)>>4];
+  p->stopband_begin = 1;
+  p->precision =
+    quality == SOXR_QQ      ?  0 :
+    quality <= SOXR_16_BITQ ? 16 :
+    quality <= SOXR_32_BITQ ?  4 + quality * 4 :
+    quality <= SOXR_LSR2Q   ? 55 - quality * 4 : /* TODO: move to soxr-lsr.c */
+    0;
+  rej = p->precision * linear_to_dB(2.);
+  p->flags = flags;
+  if (quality <= SOXR_32_BITQ || quality == SOXR_PRECISIONQ) {
+    #define LOW_Q_BW0     (1385 / 2048.) /* 0.67625 rounded to be a FP exact. */
+    p->passband_end = quality == 1? LOW_Q_BW0 : 1 - .05 / lsx_to_3dB(rej);
+    if (quality <= 2)
+      p->flags &= ~SOXR_ROLLOFF_NONE, p->flags |= SOXR_ROLLOFF_MEDIUM;
+  }
+  else { /* TODO: move to soxr-lsr.c */
+    static float const bw[] = {.931f, .832f, .663f};
+    p->passband_end = bw[quality - SOXR_LSR0Q];
+    if (quality == SOXR_LSR2Q) {
+      p->flags &= ~SOXR_ROLLOFF_NONE;
+      p->flags |= SOXR_ROLLOFF_LSR2Q | SOXR_PROMOTE_TO_LQ;
+    }
+  }
+  if (recipe & SOXR_STEEP_FILTER)
+    p->passband_end = 1 - .01 / lsx_to_3dB(rej);
+  return spec;
+}
+
+
+/* Returns the id string of the engine in p's control block (p is not checked). */
+char const * soxr_engine(soxr_t p)
+{
+  return resampler_id();
+}
+
+
+/* Returns a pointer to p's running clip counter (p is not checked for null). */
+size_t * soxr_num_clips(soxr_t p)
+{
+  return &p->clips;
+}
+
+
+/* Returns the error recorded in p, or 0 if none (p is not checked for null). */
+soxr_error_t soxr_error(soxr_t p)
+{
+  return p->error;
+}
+
+
+/* Returns a zeroed runtime spec with default sizes and the given num_threads. */
+soxr_runtime_spec_t soxr_runtime_spec(unsigned num_threads)
+{
+  soxr_runtime_spec_t spec;
+  memset(&spec, 0, sizeof(spec));
+  spec.log2_min_dft_size = 10;
+  spec.log2_large_dft_size = 17;
+  spec.coef_size_kbytes = 400;
+  spec.num_threads = num_threads;
+  return spec;
+}
+
+/* Returns an I/O spec for the given datatypes with scale 1; if either type is
+ * out of range, the spec is all-zero apart from the error string in spec.e. */
+soxr_io_spec_t soxr_io_spec(
+  soxr_datatype_t itype,
+  soxr_datatype_t otype)
+{
+  soxr_io_spec_t spec;
+  memset(&spec, 0, sizeof(spec));
+  if ((itype | otype) < SOXR_SPLIT * 2) {
+    spec.itype = itype;
+    spec.otype = otype;
+    spec.scale = 1;
+  }
+  else
+    spec.e = "invalid io datatype(s)";
+  return spec;
+}
+
+
+
+#if (WITH_CR32S && WITH_CR32) || (WITH_CR64S && WITH_CR64)
+  #if defined __GNUC__ && defined __x86_64__
+    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+      __asm__ __volatile__ ( \
+        "cpuid \n\t" \
+        : "=a" (eax_), "=b" (ebx_), "=c" (ecx_), "=d" (edx_) \
+        : "a" (type), "c" (0));
+  #elif defined __GNUC__ && defined __i386__
+    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+      __asm__ __volatile__ ( \
+        "mov %%ebx, %%edi \n\t" \
+        "cpuid \n\t" \
+        "xchg %%edi, %%ebx \n\t" \
+        : "=a" (eax_), "=D" (ebx_), "=c" (ecx_), "=d" (edx_) \
+        : "a" (type), "c" (0));
+  #elif defined _M_X64 && defined _MSC_VER && _MSC_VER > 1500
+     void __cpuidex(int CPUInfo[4], int info_type, int ecxvalue);
+     #pragma intrinsic(__cpuidex)
+     #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+       int regs[4]; \
+       __cpuidex(regs, type, 0); \
+       eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+     } while(0)
+  #elif defined _M_X64 && defined _MSC_VER
+     void __cpuidex(int CPUInfo[4], int info_type);
+     #pragma intrinsic(__cpuidex)
+     #define CPUID(type, eax_, ebx_, ecx_, edx_) do { \
+       int regs[4]; \
+       __cpuidex(regs, type); \
+       eax_ = regs[0], ebx_ = regs[1], ecx_ = regs[2], edx_ = regs[3]; \
+     } while(0)
+  #elif defined _M_IX86 && defined _MSC_VER
+    #define CPUID(type, eax_, ebx_, ecx_, edx_) \
+      __asm pushad \
+      __asm mov eax, type \
+      __asm xor ecx, ecx \
+      __asm cpuid \
+      __asm mov eax_, eax \
+      __asm mov ebx_, ebx \
+      __asm mov ecx_, ecx \
+      __asm mov edx_, edx \
+      __asm popad
+  #endif
+#endif
+
+
+/* 32-bit SIMD choice: SOXR_USE_SIMD, then SOXR_USE_SIMD32, then a CPU probe. */
+#if WITH_CR32S && WITH_CR32
+  static bool cpu_has_simd32(void)
+  {
+  #if defined __x86_64__ || defined _M_X64
+    return true;
+  #elif defined __i386__ || defined _M_IX86
+    enum {SSE = 1 << 25, SSE2 = 1 << 26};
+    unsigned eax_, ebx_, ecx_, edx_;
+    CPUID(1, eax_, ebx_, ecx_, edx_);
+    return (edx_ & (SSE|SSE2)) != 0;
+  #elif defined AV_CPU_FLAG_NEON
+    return !!(av_get_cpu_flags() & AV_CPU_FLAG_NEON);
+  #else
+    return false;
+  #endif
+  }
+
+  static bool should_use_simd32(void)
+  {
+    char const * e;
+    return ((e = getenv("SOXR_USE_SIMD"  )))? !!atoi(e) :
+           ((e = getenv("SOXR_USE_SIMD32")))? !!atoi(e) : cpu_has_simd32();
+  }
+#else
+  #define should_use_simd32() true
+#endif
+
+
+
+#if WITH_CR64S && WITH_CR64
+  #if defined __GNUC__
+    #define XGETBV(type, eax_, edx_) \
+      __asm__ __volatile__ ( \
+        ".byte 0x0f, 0x01, 0xd0\n" \
+        : "=a"(eax_), "=d"(edx_) : "c" (type));
+  #elif defined _M_X64 && defined _MSC_FULL_VER && _MSC_FULL_VER >= 160040219
+    #include <immintrin.h>
+    #define XGETBV(type, eax_, edx_) do { \
+      union {uint64_t x; uint32_t y[2];} a = {_xgetbv(0)}; \
+      eax_ = a.y[0], edx_ = a.y[1]; \
+     } while(0)
+  #elif defined _M_IX86 && defined _MSC_VER
+    #define XGETBV(type, eax_, edx_) \
+      __asm pushad \
+      __asm mov ecx, type \
+      __asm _emit 0x0f \
+      __asm _emit 0x01 \
+      __asm _emit 0xd0 \
+      __asm mov eax_, eax \
+      __asm mov edx_, edx \
+      __asm popad
+  #else
+    #define XGETBV(type, eax_, edx_) eax_ = edx_ = 0
+  #endif
+  /* True if CPUID(1) ECX has OSXSAVE and AVX, and XGETBV(0) EAX has bits 1-2. */
+  static bool cpu_has_simd64(void)
+  {
+    enum {OSXSAVE = 1 << 27, AVX = 1 << 28};
+    unsigned eax_, ebx_, ecx_, edx_;
+    CPUID(1, eax_, ebx_, ecx_, edx_);
+    if ((ecx_ & (OSXSAVE|AVX)) == (OSXSAVE|AVX)) {
+      XGETBV(0, eax_, edx_);
+      return (eax_ & 6) == 6;
+    }
+    return false;
+  }
+  /* SOXR_USE_SIMD, then SOXR_USE_SIMD64, override the cpu_has_simd64 probe. */
+  static bool should_use_simd64(void)
+  {
+    char const * e;
+    return ((e = getenv("SOXR_USE_SIMD"  )))? !!atoi(e) :
+           ((e = getenv("SOXR_USE_SIMD64")))? !!atoi(e) : cpu_has_simd64();
+  }
+#else
+  #define should_use_simd64() true
+#endif
+
+
+
+extern control_block_t
+  _soxr_rate32_cb,
+  _soxr_rate32s_cb,
+  _soxr_rate64_cb,
+  _soxr_rate64s_cb,
+  _soxr_vr32_cb;
+
+
+/* Overrides *field with env_name's integer value when it is in [min, max]. */
+static void runtime_num(char const * env_name,
+    int min, int max, unsigned * field)
+{
+  char const * text = getenv(env_name);
+  int value;
+  if (!text)
+    return;
+  value = atoi(text);
+  if (value >= min && value <= max) *field = (unsigned)value;
+}
+
+
+/* Overrides an n_bits-wide field of *flags from env_name's integer value. */
+static void runtime_flag(char const * env_name,
+    unsigned n_bits, unsigned n_shift, unsigned long * flags)
+{
+  unsigned long const mask = (1UL << n_bits) - 1;
+  char const * text = getenv(env_name);
+  int value = text? atoi(text) : -1;
+  if (value < 0 || value > (int)mask)
+    return;
+  /* Replace only the field located at n_shift: */
+  *flags = (*flags & ~(mask << n_shift)) | ((unsigned long)value << n_shift);
+}
+
+/* Creates a resampler; null specs select defaults and SOXR_* environment
+ * variables may override runtime settings.  On error: returns 0, sets *error0. */
+soxr_t soxr_create(
+  double input_rate, double output_rate,
+  unsigned num_channels,
+  soxr_error_t * error0,
+  soxr_io_spec_t const * io_spec,
+  soxr_quality_spec_t const * q_spec,
+  soxr_runtime_spec_t const * runtime_spec)
+{
+  double io_ratio = output_rate!=0? input_rate!=0?
+    input_rate / output_rate : -1 : input_rate!=0? -1 : 0;
+  static const float datatype_full_scale[] = {1, 1, 65536.*32768, 32768};
+  soxr_t p = 0;
+  soxr_error_t error = 0;
+
+#if WITH_DEV_TRACE
+#define _(x) (char)(sizeof(x)>=10? 'a'+(char)(sizeof(x)-10):'0'+(char)sizeof(x))
+  char const * e = getenv("SOXR_TRACE");
+  _soxr_trace_level = e? atoi(e) : 0;
+  {
+    static char const arch[] = {_(char), _(short), _(int), _(long), _(long long)
+      , ' ', _(float), _(double), _(long double)
+      , ' ', _(int *), _(int (*)(int))
+      , ' ', HAVE_BIGENDIAN ? 'B' : 'L'
+#if defined _OPENMP
+      , ' ', 'O', 'M', 'P'
+#endif
+      , 0};
+#undef _
+    lsx_debug("arch: %s", arch);
+  }
+#endif
+
+  if (q_spec && q_spec->e)  error = q_spec->e;
+  else if (io_spec && (io_spec->itype | io_spec->otype) >= SOXR_SPLIT * 2)
+    error = "invalid io datatype(s)";
+
+  if (!error && !(p = calloc(sizeof(*p), 1))) error = "malloc failed";
+
+  if (p) {
+    control_block_t * control_block;
+
+    p->q_spec = q_spec? *q_spec : soxr_quality_spec(SOXR_HQ, 0);
+
+    if (q_spec) { /* Backwards compatibility with original API: */
+      if (p->q_spec.passband_end > 2)
+        p->q_spec.passband_end /= 100;
+      if (p->q_spec.stopband_begin > 2)
+        p->q_spec.stopband_begin = 2 - p->q_spec.stopband_begin / 100;
+    }
+
+    p->io_ratio = io_ratio;
+    p->num_channels = num_channels;
+    if (io_spec)
+      p->io_spec = *io_spec;
+    else
+      p->io_spec.scale = 1;
+
+    p->runtime_spec = runtime_spec? *runtime_spec : soxr_runtime_spec(1);
+
+    runtime_num("SOXR_MIN_DFT_SIZE", 8, 15, &p->runtime_spec.log2_min_dft_size);
+    runtime_num("SOXR_LARGE_DFT_SIZE", 8, 20, &p->runtime_spec.log2_large_dft_size);
+    runtime_num("SOXR_COEFS_SIZE", 100, 800, &p->runtime_spec.coef_size_kbytes);
+    runtime_num("SOXR_NUM_THREADS", 0, 64, &p->runtime_spec.num_threads);
+    runtime_flag("SOXR_COEF_INTERP", 2, 0, &p->runtime_spec.flags);
+
+    runtime_flag("SOXR_STRICT_BUF", 1, 2, &p->runtime_spec.flags);
+    runtime_flag("SOXR_NOSMALLINTOPT", 1, 3, &p->runtime_spec.flags);
+
+    p->io_spec.scale *= datatype_full_scale[p->io_spec.otype & 3] /
+                        datatype_full_scale[p->io_spec.itype & 3];
+
+    p->seed = (unsigned long)time(0) ^ (unsigned long)(size_t)p;
+
+#if WITH_CR32 || WITH_CR32S || WITH_VR32
+    if (0
+#if WITH_VR32
+        || ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))
+#endif
+#if WITH_CR32 || WITH_CR32S
+        || !(WITH_CR64 || WITH_CR64S) || (p->q_spec.precision <= 20 && !(p->q_spec.flags & SOXR_DOUBLE_PRECISION))
+#endif
+        ) {
+      p->deinterleave = (deinterleave_t)_soxr_deinterleave_f;
+      p->interleave = (interleave_t)_soxr_interleave_f;
+      control_block =
+#if WITH_VR32
+          ((!WITH_CR32 && !WITH_CR32S) || (p->q_spec.flags & SOXR_VR))? &_soxr_vr32_cb :
+#endif
+#if WITH_CR32S
+          !WITH_CR32 || should_use_simd32()? &_soxr_rate32s_cb :
+#endif
+          &_soxr_rate32_cb;
+    }
+#if WITH_CR64 || WITH_CR64S
+    else
+#endif
+#endif
+#if WITH_CR64 || WITH_CR64S
+    {
+      p->deinterleave = (deinterleave_t)_soxr_deinterleave;
+      p->interleave = (interleave_t)_soxr_interleave;
+      control_block =
+#if WITH_CR64S
+          !WITH_CR64 || should_use_simd64()? &_soxr_rate64s_cb :
+#endif
+          &_soxr_rate64_cb;
+    }
+#endif
+    memcpy(&p->control_block, control_block, sizeof(p->control_block));
+
+    if (p->num_channels && io_ratio!=0) /* Else set-up is left to later calls. */
+      error = soxr_set_io_ratio(p, io_ratio, 0);
+  }
+  if (error)
+    soxr_delete(p), p = 0;
+  if (error0)
+    *error0 = error;
+  return p;
+}
+
+
+/* Registers a pull-style input callback; max_ilen 0 becomes (size_t)-1. */
+soxr_error_t soxr_set_input_fn(soxr_t p,
+    soxr_input_fn_t input_fn, void * input_fn_state, size_t max_ilen)
+{
+  p->input_fn_state = input_fn_state;
+  p->input_fn = input_fn;
+  p->max_ilen = max_ilen? max_ilen : (size_t)-1;
+  return 0;
+}
+
+
+/* Closes and frees the per-channel and shared state, then zeroes *p. */
+static void soxr_delete0(soxr_t p)
+{
+  unsigned i;
+
+  if (p->resamplers) for (i = 0; i < p->num_channels; ++i) {
+    if (p->resamplers[i])
+      resampler_close(p->resamplers[i]);
+    free(p->resamplers[i]);
+  }
+  free(p->resamplers);
+  free(p->channel_ptrs);
+  free(p->shared);
+
+  memset(p, 0, sizeof(*p));
+}
+
+
+/* Delay reported by channel 0's resampler; 0 if p is unusable or uninitialised. */
+double soxr_delay(soxr_t p)
+{
+  if (!p || p->error || !p->resamplers) return 0;
+  return resampler_delay(p->resamplers[0]);
+}
+
+
+/* Tears p down via soxr_delete0 (zeroing it), then records and returns error. */
+static soxr_error_t fatal_error(soxr_t p, soxr_error_t error)
+{
+  soxr_delete0(p);
+  return p->error = error;
+}
+
+
+/* Allocates shared and per-channel state and creates one resampler per channel. */
+static soxr_error_t initialise(soxr_t p)
+{
+  unsigned i;
+  size_t shared_size, channel_size;
+
+  resampler_sizes(&shared_size, &channel_size);
+  p->channel_ptrs = calloc(sizeof(*p->channel_ptrs), p->num_channels);
+  p->shared = calloc(shared_size, 1);
+  p->resamplers = calloc(sizeof(*p->resamplers), p->num_channels);
+  if (!p->shared || !p->channel_ptrs || !p->resamplers)
+    return fatal_error(p, "malloc failed");
+
+  for (i = 0; i < p->num_channels; ++i) {
+    soxr_error_t error;
+    if (!(p->resamplers[i] = calloc(channel_size, 1)))
+      return fatal_error(p, "malloc failed");
+    error = resampler_create(
+        p->resamplers[i],
+        p->shared,
+        p->io_ratio,
+        &p->q_spec,
+        &p->runtime_spec,
+        p->io_spec.scale);
+    if (error)
+      return fatal_error(p, error);
+  }
+  return 0;
+}
+
+
+/* Sets the channel count once, before any resamplers have been created. */
+soxr_error_t soxr_set_num_channels(soxr_t p, unsigned num_channels)
+{
+  if (!p)                return "invalid soxr_t pointer";
+  if (num_channels == p->num_channels) return p->error;
+  if (!num_channels)     return "invalid # of channels";
+  if (p->resamplers)     return "# of channels can't be changed";
+  p->num_channels = num_channels;
+  return soxr_set_io_ratio(p, p->io_ratio, 0);
+}
+
+/* Sets the I/O ratio.  The first valid call runs initialise(); later calls are
+ * forwarded if control_block[8] exists, else only an unchanged ratio is OK. */
+soxr_error_t soxr_set_io_ratio(soxr_t p, double io_ratio, size_t slew_len)
+{
+  unsigned i;
+  soxr_error_t error;
+  if (!p)                 return "invalid soxr_t pointer";
+  if ((error = p->error)) return error;
+  if (!p->num_channels)   return "must set # channels before O/I ratio";
+  if (io_ratio <= 0)      return "I/O ratio out-of-range";
+  if (!p->channel_ptrs) {
+    p->io_ratio = io_ratio;
+    return initialise(p);
+  }
+  if (p->control_block[8]) {
+    for (i = 0; !error && i < p->num_channels; ++i)
+      resampler_set_io_ratio(p->resamplers[i], io_ratio, slew_len);
+    return error;
+  }
+  return fabs(p->io_ratio - io_ratio) < 1e-15? 0 :
+    "varying O/I ratio is not supported with this quality level";
+}
+
+
+/* Releases everything owned by p, then p itself; a null p is ignored. */
+void soxr_delete(soxr_t p)
+{
+  if (!p) return;
+  soxr_delete0(p); free(p);
+}
+
+/* Resets p for reuse, keeping its specs, engine and input callback (including
+ * max_ilen, which was previously lost, capping callback requests at 0). */
+soxr_error_t soxr_clear(soxr_t p) /* TODO: this, properly. */
+{
+  if (p) {
+    struct soxr tmp = *p;
+    soxr_delete0(p);
+    memset(p, 0, sizeof(*p));
+    p->input_fn = tmp.input_fn;
+    p->max_ilen = tmp.max_ilen;
+    p->runtime_spec = tmp.runtime_spec;
+    p->q_spec = tmp.q_spec;
+    p->io_spec = tmp.io_spec;
+    p->num_channels = tmp.num_channels;
+    p->input_fn_state = tmp.input_fn_state;
+    memcpy(p->control_block, tmp.control_block, sizeof(p->control_block));
+    p->deinterleave = tmp.deinterleave, p->interleave = tmp.interleave;
+    return (p->q_spec.flags & RESET_ON_CLEAR)?
+      soxr_set_io_ratio(p, tmp.io_ratio, 0) : 0;
+  }
+  return "invalid soxr_t pointer";
+}
+
+
+/* Copies len samples of one channel from src into resampler i's input buffer. */
+static void soxr_input_1ch(soxr_t p, unsigned i, soxr_cbuf_t src, size_t len)
+{
+  sample_t * dest = resampler_input(p->resamplers[i], NULL, len);
+  (*p->deinterleave)(&dest, p->io_spec.itype, &src, len, 1);
+}
+
+/* Feeds len frames from `in' to every channel, de-interleaving if needed.
+ * Returns len, or 0 on error or when len is 0 (which starts flushing). */
+static size_t soxr_input(soxr_t p, void const * in, size_t len)
+{
+  unsigned i;
+  if (!p || p->error) return 0;
+  if (!in && len) {p->error = "null input buffer pointer"; return 0;}
+  if (!len) {
+    p->flushing = true;
+    return 0;
+  }
+  /* p is dereferenced only after the null check above. */
+  if (p->io_spec.itype & SOXR_SPLIT)
+    for (i = 0; i < p->num_channels; ++i)
+      soxr_input_1ch(p, i, ((soxr_cbufs_t)in)[i], len);
+  else {
+    for (i = 0; i < p->num_channels; ++i)
+      p->channel_ptrs[i] = resampler_input(p->resamplers[i], NULL, len);
+    (*p->deinterleave)(
+        (sample_t **)p->channel_ptrs, p->io_spec.itype, &in, len, p->num_channels);
+  }
+  return len;
+}
+
+
+/* Processes channel i for up to len samples; returns the count produced. */
+static size_t soxr_output_1ch(soxr_t p, unsigned i, soxr_buf_t dest, size_t len, bool separated)
+{
+  sample_t const * src;
+  if (p->flushing)
+    resampler_flush(p->resamplers[i]);
+  resampler_process(p->resamplers[i], len);
+  src = resampler_output(p->resamplers[i], NULL, &len);
+  if (separated) /* Split output is written to dest straight away; */
+    p->clips += (p->interleave)(p->io_spec.otype, &dest, &src,
+      len, 1, (p->io_spec.flags & SOXR_NO_DITHER)? 0 : &p->seed);
+  else p->channel_ptrs[i] = (void /* const */ *)src; /* else left to caller. */
+  return len;
+}
+
+/* Runs every channel through soxr_output_1ch (in parallel under OpenMP when
+ * num_threads is 0) and, for interleaved output, interleaves into `out'. */
+static size_t soxr_output_no_callback(soxr_t p, soxr_buf_t out, size_t len)
+{
+  unsigned u;
+  size_t done = 0;
+  bool separated = !!(p->io_spec.otype & SOXR_SPLIT);
+#if defined _OPENMP
+  int i;
+  if (!p->runtime_spec.num_threads && p->num_channels > 1)
+#pragma omp parallel for
+  for (i = 0; i < (int)p->num_channels; ++i) {
+    size_t done1; /* `out' is an array of channel pointers only when split: */
+    done1 = soxr_output_1ch(p, (unsigned)i, separated? ((soxr_bufs_t)out)[i] : 0, len, separated);
+    if (!i)
+      done = done1;
+  } else
+#endif
+  for (u = 0; u < p->num_channels; ++u)
+    done = soxr_output_1ch(p, u, separated? ((soxr_bufs_t)out)[u] : 0, len, separated);
+
+  if (!separated)
+    p->clips += (p->interleave)(p->io_spec.otype, &out, (sample_t const * const *)p->channel_ptrs,
+        done, p->num_channels, (p->io_spec.flags & SOXR_NO_DITHER)? 0 : &p->seed);
+  return done;
+}
+
+
+
+size_t soxr_output(soxr_t p, void * out, size_t len0)
+{ /* Output up to len0 samples, calling p->input_fn (if set) for more input. */
+  size_t odone, odone0 = 0, olen = len0, osize, idone;
+  size_t ilen;
+  void const * in = out; /* Set to !=0, so that caller may leave unset. */
+  bool was_flushing;
+
+  if (!p || p->error) return 0;     /* N.B. test p before dereferencing it. */
+  if (!out && len0) {p->error = "null output buffer pointer"; return 0;}
+  ilen = min(p->max_ilen, (size_t)ceil((double)olen *p->io_ratio));
+  do {
+    odone = soxr_output_no_callback(p, out, olen);
+    odone0 += odone;
+    if (odone0 == len0 || !p->input_fn || p->flushing)
+      break;
+
+    osize = soxr_datatype_size(p->io_spec.otype) * p->num_channels;
+    out = (char *)out + osize * odone;
+    olen -= odone;
+    idone = p->input_fn(p->input_fn_state, &in, ilen);
+    was_flushing = p->flushing;
+    if (!in)
+      p->error = "input function reported failure";
+    else soxr_input(p, in, idone);
+  } while (odone || idone || (!was_flushing && p->flushing));
+  return odone0;
+}
+
+
+
+static size_t soxr_i_for_o(soxr_t p, size_t olen, size_t ilen)
+{ /* ceil(olen * io_ratio) input samples, but no more than ilen. */
+  size_t needed;
+#if 0
+  if (p->runtime_spec.flags & SOXR_STRICT_BUFFERING)
+    needed = rate_i_for_o(p->resamplers[0], olen);
+  else
+#endif
+    needed = (size_t)ceil(p->io_ratio * (double)olen);
+  return needed < ilen? needed : ilen;
+}
+
+
+
+#if 0 /* Disabled: its only use is the commented-out call in soxr_process. */
+static size_t soxr_o_for_i(soxr_t p, size_t ilen, size_t olen)
+{ /* ceil(ilen / io_ratio) output samples, but no more than olen. */
+  size_t result = (size_t)ceil((double)ilen / p->io_ratio);
+  return min(result, olen);
+}
+#endif
+
+
+
+soxr_error_t soxr_process(soxr_t p,
+    void const * in , size_t ilen0, size_t * idone0,
+    void       * out, size_t olen , size_t * odone0)
+{ /* Input up to ilen0 and output up to olen samples; returns p->error. */
+  size_t ilen, idone, odone = 0;
+  unsigned u;
+  bool flush_requested = false;
+
+  if (!p) return "null pointer";
+
+  if (!in)                          /* Null input: flush, with nothing to read. */
+    flush_requested = true, ilen = ilen0 = 0;
+  else {
+    if ((ptrdiff_t)ilen0 < 0)  /* Complemented length (per soxr_oneshot): flush. */
+      flush_requested = true, ilen0 = ~ilen0;
+    if (idone0 && (1 || flush_requested))  /* `1 ||': depends on idone0 only. */
+      ilen = soxr_i_for_o(p, olen, ilen0);
+    else
+      ilen = ilen0/*, olen = soxr_o_for_i(p, ilen, olen)*/;
+  }
+  p->flushing |= ilen == ilen0 && flush_requested; /* Only if all i/p is used. */
+
+  if (!out && !in)                             /* Nothing to read or write. */
+    idone = ilen;
+  else if (p->io_spec.itype & p->io_spec.otype & SOXR_SPLIT) { /* Both i & o */
+#if defined _OPENMP
+    int i;
+    if (!p->runtime_spec.num_threads && p->num_channels > 1)
+#pragma omp parallel for
+    for (i = 0; i < (int)p->num_channels; ++i) {
+      size_t done;
+      if (in)
+        soxr_input_1ch(p, (unsigned)i, ((soxr_cbufs_t)in)[i], ilen);
+      done = soxr_output_1ch(p, (unsigned)i, ((soxr_bufs_t)out)[i], olen, true);
+      if (!i)                    /* Only channel 0's count is reported back. */
+        odone = done;
+    } else
+#endif
+    for (u = 0; u < p->num_channels; ++u) {
+      if (in)
+        soxr_input_1ch(p, u, ((soxr_cbufs_t)in)[u], ilen);
+      odone = soxr_output_1ch(p, u, ((soxr_bufs_t)out)[u], olen, true);
+    } /* NOTE(review): out is indexed above even if it is null -- confirm. */
+    idone = ilen;
+  }
+  else {       /* N.B. soxr_input is skipped for 0 length as that means flush. */
+    idone = ilen? soxr_input (p, in , ilen) : 0;
+    odone = soxr_output(p, out, olen);
+  }
+  if (idone0) *idone0 = idone;
+  if (odone0) *odone0 = odone;
+  return p->error;
+}
+
+
+
+soxr_error_t soxr_oneshot(
+    double irate, double orate,
+    unsigned num_channels,
+    void const * in , size_t ilen, size_t * idone,
+    void * out, size_t olen, size_t * odone,
+    soxr_io_spec_t const * io_spec,
+    soxr_quality_spec_t const * q_spec,
+    soxr_runtime_spec_t const * runtime_spec)
+{ /* Create, process once (~ilen requests a flush in soxr_process), delete. */
+  soxr_quality_spec_t fallback_q;
+  soxr_t one_off;
+  soxr_error_t err = q_spec? q_spec->e : 0;
+  if (err) return err;            /* The given quality spec. holds an error. */
+  if (!q_spec) {                       /* Not given: use the SOXR_LQ recipe. */
+    fallback_q = soxr_quality_spec(SOXR_LQ, 0);
+    q_spec = &fallback_q;
+  }
+  one_off = soxr_create(irate, orate, num_channels,
+      &err, io_spec, q_spec, runtime_spec);
+  if (err) return err;
+  err = soxr_process(one_off, in, ~ilen, idone, out, olen, odone);
+  soxr_delete(one_off);
+  return err;
+}
+
+
+
+soxr_error_t soxr_set_error(soxr_t p, soxr_error_t error)
+{ /* Store `error' in p, unless p already holds a different error. */
+  if (!p) return "null pointer";
+  if (p->error && p->error != error) return p->error; /* Keep the 1st error. */
+  p->error = error;
+  return 0;
+}
diff --git a/src/soxr.h b/src/soxr.h
new file mode 100644
index 0000000..09ec7c4
--- /dev/null
+++ b/src/soxr.h
@@ -0,0 +1,344 @@
+/* SoX Resampler Library      Copyright (c) 2007-18 robs@users.sourceforge.net
+ *
+ * This library is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU Lesser General Public License as published by
+ * the Free Software Foundation; either version 2.1 of the License, or (at
+ * your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public License
+ * along with this library; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+
+
+/* -------------------------------- Gubbins --------------------------------- */
+
+#if !defined soxr_included
+#define soxr_included
+
+
+#if defined __cplusplus
+  #include <cstddef>
+  extern "C" {
+#else
+  #include <stddef.h>
+#endif
+
+#if defined SOXR_DLL
+  #if defined soxr_EXPORTS
+    #define SOXR __declspec(dllexport)
+  #else
+    #define SOXR __declspec(dllimport)
+  #endif
+#elif defined SOXR_VISIBILITY && defined __GNUC__ && (__GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 1)
+  #define SOXR __attribute__ ((visibility("default")))
+#else
+  #define SOXR
+#endif
+
+typedef struct soxr_io_spec soxr_io_spec_t;
+typedef struct soxr_quality_spec soxr_quality_spec_t;
+typedef struct soxr_runtime_spec soxr_runtime_spec_t;
+
+
+
+/* ---------------------------- API conventions --------------------------------
+
+Buffer lengths (and occupancies) are expressed as the number of contained
+samples per channel.
+
+Parameter names for buffer lengths have the suffix `len'.
+
+A single-character `i' or `o' is often used in names to give context as
+input or output (e.g. ilen, olen).                                            */
+
+
+
+/* --------------------------- Version management --------------------------- */
+
+/* E.g. #if SOXR_THIS_VERSION >= SOXR_VERSION(0,1,1) ...                      */
+
+#define SOXR_VERSION(x,y,z)     (((x)<<16)|((y)<<8)|(z))
+#define SOXR_THIS_VERSION       SOXR_VERSION(0,1,3)
+#define SOXR_THIS_VERSION_STR               "0.1.3"
+
+
+
+/* --------------------------- Type declarations ---------------------------- */
+
+typedef struct soxr * soxr_t;          /* A resampler for 1 or more channels. */
+typedef char const * soxr_error_t;                /* 0:no-error; non-0:error. */
+
+typedef void       * soxr_buf_t;  /* 1 buffer of channel-interleaved samples. */
+typedef void const * soxr_cbuf_t;                        /* Ditto; read-only. */
+
+typedef soxr_buf_t const  * soxr_bufs_t;/* Or, a separate buffer for each ch. */
+typedef soxr_cbuf_t const * soxr_cbufs_t;                /* Ditto; read-only. */
+
+typedef void const * soxr_in_t;      /* Either a soxr_cbuf_t or soxr_cbufs_t,
+                                        depending on itype in soxr_io_spec_t. */
+typedef void       * soxr_out_t;     /* Either a soxr_buf_t or soxr_bufs_t,
+                                        depending on otype in soxr_io_spec_t. */
+
+
+
+/* --------------------------- API main functions --------------------------- */
+
+SOXR char const * soxr_version(void);  /* Query library version: "libsoxr-x.y.z" */
+
+#define soxr_strerror(e)               /* Soxr counterpart to strerror. */     \
+    ((e)?(e):"no error")
+
+
+/* Create a stream resampler: */
+
+SOXR soxr_t soxr_create(
+    double      input_rate,      /* Input sample-rate. */
+    double      output_rate,     /* Output sample-rate. */
+    unsigned    num_channels,    /* Number of channels to be used. */
+        /* All following arguments are optional (may be set to NULL). */
+    soxr_error_t *,              /* To report any error during creation. */
+    soxr_io_spec_t const *,      /* To specify non-default I/O formats. */
+    soxr_quality_spec_t const *, /* To specify non-default resampling quality.*/
+    soxr_runtime_spec_t const *);/* To specify non-default runtime resources.
+
+    Default io_spec      is per soxr_io_spec(SOXR_FLOAT32_I, SOXR_FLOAT32_I)
+    Default quality_spec is per soxr_quality_spec(SOXR_HQ, 0)
+    Default runtime_spec is per soxr_runtime_spec(1)                          */
+
+
+
+/* If not using an app-supplied input function, after creating a stream
+ * resampler, repeatedly call: */
+
+SOXR soxr_error_t soxr_process(
+    soxr_t      resampler,      /* As returned by soxr_create. */
+                            /* Input (to be resampled): */
+    soxr_in_t   in,             /* Input buffer(s); may be NULL (see below). */
+    size_t      ilen,           /* Input buf. length (samples per channel). */
+    size_t      * idone,        /* To return actual # samples used (<= ilen). */
+                            /* Output (resampled): */
+    soxr_out_t  out,            /* Output buffer(s).*/
+    size_t      olen,           /* Output buf. length (samples per channel). */
+    size_t      * odone);       /* To return actual # samples out (<= olen).
+
+    Note that no special meaning is associated with ilen or olen equal to
+    zero.  End-of-input (i.e. no data is available nor shall be available)
+    may be indicated by setting `in' to NULL.                                 */
+
+
+
+/* If using an app-supplied input function, it must look and behave like this:*/
+
+typedef size_t /* data_len */
+  (* soxr_input_fn_t)(         /* Supply data to be resampled. */
+    void * input_fn_state,     /* As given to soxr_set_input_fn (below). */
+    soxr_in_t * data,          /* Returned data; see below. N.B. ptr to ptr(s)*/
+    size_t requested_len);     /* Samples per channel, >= returned data_len.
+
+  data_len  *data     Indicates    Meaning
+   ------- -------   ------------  -------------------------
+     !=0     !=0       Success     *data contains data to be
+                                   input to the resampler.
+      0    !=0 (or   End-of-input  No data is available nor
+           not set)                shall be available.
+      0       0        Failure     An error occurred whilst trying to
+                                   source data to be input to the resampler.  */
+
+/* and be registered with a previously created stream resampler using: */
+
+SOXR soxr_error_t soxr_set_input_fn(/* Set (or reset) an input function.*/
+    soxr_t resampler,            /* As returned by soxr_create. */
+    soxr_input_fn_t,             /* Function to supply data to be resampled.*/
+    void * input_fn_state,       /* If needed by the input function. */
+    size_t max_ilen);            /* Maximum value for input fn. requested_len.*/
+
+/* then repeatedly call: */
+
+SOXR size_t /*odone*/ soxr_output(/* Resample and output a block of data.*/
+    soxr_t resampler,            /* As returned by soxr_create. */
+    soxr_out_t data,             /* App-supplied buffer(s) for resampled data.*/
+    size_t olen);                /* Amount of data to output; >= odone. */
+
+
+
+/* Common stream resampler operations: */
+
+SOXR soxr_error_t soxr_error(soxr_t);   /* Query error status. */
+SOXR size_t   * soxr_num_clips(soxr_t); /* Query int. clip counter (for R/W). */
+SOXR double     soxr_delay(soxr_t);  /* Query current delay in output samples.*/
+SOXR char const * soxr_engine(soxr_t);  /* Query resampling engine name. */
+
+SOXR soxr_error_t soxr_clear(soxr_t); /* Ready for fresh signal, same config. */
+SOXR void         soxr_delete(soxr_t);  /* Free resources. */
+
+
+
+/* `Short-cut', single call to resample a (probably short) signal held entirely
+ * in memory.  See soxr_create and soxr_process above for parameter details.
+ * Note that unlike soxr_create however, the default quality spec. for
+ * soxr_oneshot is per soxr_quality_spec(SOXR_LQ, 0). */
+
+SOXR soxr_error_t soxr_oneshot(
+    double         input_rate,
+    double         output_rate,
+    unsigned       num_channels,
+    soxr_in_t    in , size_t ilen, size_t * idone,
+    soxr_out_t   out, size_t olen, size_t * odone,
+    soxr_io_spec_t const *,
+    soxr_quality_spec_t const *,
+    soxr_runtime_spec_t const *);
+
+
+
+/* For variable-rate resampling. See example # 5 for how to create a
+ * variable-rate resampler and how to use this function. */
+
+SOXR soxr_error_t soxr_set_io_ratio(soxr_t, double io_ratio, size_t slew_len);
+
+
+
+/* -------------------------- API type definitions -------------------------- */
+
+typedef enum {          /* Datatypes supported for I/O to/from the resampler: */
+  /* Internal; do not use: */
+  SOXR_FLOAT32, SOXR_FLOAT64, SOXR_INT32, SOXR_INT16, SOXR_SPLIT = 4,
+
+  /* Use for interleaved channels: */
+  SOXR_FLOAT32_I = SOXR_FLOAT32, SOXR_FLOAT64_I, SOXR_INT32_I, SOXR_INT16_I,
+
+  /* Use for split channels: */
+  SOXR_FLOAT32_S = SOXR_SPLIT  , SOXR_FLOAT64_S, SOXR_INT32_S, SOXR_INT16_S
+
+} soxr_datatype_t;
+
+#define soxr_datatype_size(x)  /* Returns `sizeof' a soxr_datatype_t sample. */\
+  (((unsigned char const *)"\4\10\4\2")[(x)&3])
+
+
+
+struct soxr_io_spec {                                            /* Typically */
+  soxr_datatype_t itype;     /* Input datatype.                SOXR_FLOAT32_I */
+  soxr_datatype_t otype;     /* Output datatype.               SOXR_FLOAT32_I */
+  double scale;              /* Linear gain to apply during resampling.  1    */
+  void * e;                  /* Reserved for internal use                0    */
+  unsigned long flags;       /* Per the following #defines.              0    */
+};
+
+#define SOXR_TPDF              0     /* Applicable only if otype is INT16. */
+#define SOXR_NO_DITHER         8u    /* Disable the above. */
+
+
+
+struct soxr_quality_spec {                                       /* Typically */
+  double precision;         /* Conversion precision (in bits).           20   */
+  double phase_response;    /* 0=minimum, ... 50=linear, ... 100=maximum 50   */
+  double passband_end;      /* 0dB pt. bandwidth to preserve; nyquist=1  0.913*/
+  double stopband_begin;    /* Aliasing/imaging control; > passband_end   1   */
+  void * e;                 /* Reserved for internal use.                 0   */
+  unsigned long flags;      /* Per the following #defines.                0   */
+};
+
+#define SOXR_ROLLOFF_SMALL     0u    /* <= 0.01 dB */
+#define SOXR_ROLLOFF_MEDIUM    1u    /* <= 0.35 dB */
+#define SOXR_ROLLOFF_NONE      2u    /* For Chebyshev bandwidth. */
+
+#define SOXR_HI_PREC_CLOCK     8u  /* Increase `irrational' ratio accuracy. */
+#define SOXR_DOUBLE_PRECISION 16u  /* Use D.P. calcs even if precision <= 20. */
+#define SOXR_VR               32u  /* Variable-rate resampling. */
+
+
+
+struct soxr_runtime_spec {                                       /* Typically */
+  unsigned log2_min_dft_size;   /* For DFT efficiency. [8,15]           10    */
+  unsigned log2_large_dft_size; /* For DFT efficiency. [8,20]           17    */
+  unsigned coef_size_kbytes;    /* For SOXR_COEF_INTERP_AUTO (below).   400   */
+  unsigned num_threads;         /* 0: per OMP_NUM_THREADS; 1: 1 thread.  1    */
+  void * e;                     /* Reserved for internal use.            0    */
+  unsigned long flags;          /* Per the following #defines.           0    */
+};
+                                   /* For `irrational' ratios only: */
+#define SOXR_COEF_INTERP_AUTO  0u    /* Auto select coef. interpolation. */
+#define SOXR_COEF_INTERP_LOW   2u    /* Man. select: less CPU, more memory. */
+#define SOXR_COEF_INTERP_HIGH  3u    /* Man. select: more CPU, less memory. */
+
+
+
+/* -------------------------- API type constructors ------------------------- */
+
+/* These functions allow setting of the most commonly-used structure
+ * parameters, with other parameters being given default values.  The default
+ * values may then be overridden, directly in the structure, if needed.  */
+
+SOXR soxr_quality_spec_t soxr_quality_spec(
+    unsigned long recipe,       /* Per the #defines immediately below. */
+    unsigned long flags);       /* As soxr_quality_spec_t.flags. */
+
+                                  /* The 5 standard qualities found in SoX: */
+#define SOXR_QQ                 0   /* 'Quick' cubic interpolation. */
+#define SOXR_LQ                 1   /* 'Low' 16-bit with larger rolloff. */
+#define SOXR_MQ                 2   /* 'Medium' 16-bit with medium rolloff. */
+#define SOXR_HQ                 SOXR_20_BITQ /* 'High quality'. */
+#define SOXR_VHQ                SOXR_28_BITQ /* 'Very high quality'. */
+
+#define SOXR_16_BITQ            3
+#define SOXR_20_BITQ            4
+#define SOXR_24_BITQ            5
+#define SOXR_28_BITQ            6
+#define SOXR_32_BITQ            7
+                                /* Reserved for internal use (to be removed): */
+#define SOXR_LSR0Q              8     /* 'Best sinc'. */
+#define SOXR_LSR1Q              9     /* 'Medium sinc'. */
+#define SOXR_LSR2Q              10    /* 'Fast sinc'. */
+
+#define SOXR_LINEAR_PHASE       0x00
+#define SOXR_INTERMEDIATE_PHASE 0x10
+#define SOXR_MINIMUM_PHASE      0x30
+
+#define SOXR_STEEP_FILTER       0x40
+
+
+
+SOXR soxr_runtime_spec_t soxr_runtime_spec(
+    unsigned num_threads);
+
+
+
+SOXR soxr_io_spec_t soxr_io_spec(
+    soxr_datatype_t itype,
+    soxr_datatype_t otype);
+
+
+
+/* --------------------------- Advanced use only ---------------------------- */
+
+/* For new designs, the following functions/usage will probably not be needed.
+ * They might be useful when adding soxr into an existing design where values
+ * for the resampling-rate and/or number-of-channels parameters to soxr_create
+ * are not available when that function will be called.  In such cases, the
+ * relevant soxr_create parameter(s) can be given as 0, then one or both of the
+ * following (as appropriate) later invoked (but prior to calling soxr_process
+ * or soxr_output):
+ *
+ * soxr_set_error(soxr, soxr_set_io_ratio(soxr, io_ratio, 0));
+ * soxr_set_error(soxr, soxr_set_num_channels(soxr, num_channels));
+ */
+
+SOXR soxr_error_t soxr_set_error(soxr_t, soxr_error_t);
+SOXR soxr_error_t soxr_set_num_channels(soxr_t, unsigned);
+
+
+
+#undef SOXR
+
+#if defined __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/soxr.pc.in b/src/soxr.pc.in
new file mode 100644
index 0000000..69d225b
--- /dev/null
+++ b/src/soxr.pc.in
@@ -0,0 +1,5 @@
+Name: ${PROJECT_NAME}
+Description: ${DESCRIPTION_SUMMARY}
+Version: ${PROJECT_VERSION}
+Libs: -L${LIB_INSTALL_DIR} -l${PROJECT_NAME}
+Cflags: -I${INCLUDE_INSTALL_DIR}
diff --git a/src/std-types.h b/src/std-types.h
new file mode 100644
index 0000000..c5e8636
--- /dev/null
+++ b/src/std-types.h
@@ -0,0 +1,48 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_std_types_included
+#define soxr_std_types_included
+
+#include "soxr-config.h"
+
+#include <limits.h>
+
+#if HAVE_STDBOOL_H
+  #include <stdbool.h>
+#else
+  #undef bool
+  #undef false
+  #undef true
+  #define bool int
+  #define false 0
+  #define true 1
+#endif
+
+#if HAVE_STDINT_H
+  #include <stdint.h>
+#else
+  #undef int16_t
+  #undef int32_t
+  #undef int64_t
+  #undef uint32_t
+  #undef uint64_t
+  #define int16_t short
+  #if LONG_MAX > 2147483647L
+    #define int32_t int
+    #define int64_t long
+  #elif LONG_MAX < 2147483647L
+  #error this library requires that 'long int' has at least 32-bits
+  #else
+    #define int32_t long
+    #if defined _MSC_VER
+      #define int64_t __int64
+    #else
+      #define int64_t long long
+    #endif
+  #endif
+  #define uint32_t unsigned int32_t
+  #define uint64_t unsigned int64_t
+#endif
+
+#endif
diff --git a/src/util-simd.c b/src/util-simd.c
new file mode 100644
index 0000000..ec548fd
--- /dev/null
+++ b/src/util-simd.c
@@ -0,0 +1,89 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <assert.h>
+#include <string.h>
+#include <stdlib.h>
+
+#include "soxr-config.h"
+
+#define SIMD_ALIGNMENT (sizeof(float) * (1 + (PFFFT_DOUBLE|AVCODEC_FOUND)) * 4)
+
+void * SIMD_ALIGNED_MALLOC(size_t size)
+{ /* Returns a SIMD_ALIGNMENT-aligned block, or 0 on failure/size overflow. */
+  char * p1 = 0, * p = 0;
+  if (size <= (size_t)-1 - SIMD_ALIGNMENT && (p = malloc(size + SIMD_ALIGNMENT))) {
+    p1 = (char *)((size_t)(p + SIMD_ALIGNMENT) & ~(SIMD_ALIGNMENT - 1));
+    *((void * *)p1 - 1) = p;   /* Keep malloc's pointer just below the result. */
+  }
+  return p1;
+}
+
+
+
+void * SIMD_ALIGNED_CALLOC(size_t nmemb, size_t size)
+{ /* Zero-filled aligned array; 0 if nmemb * size overflows or malloc fails. */
+  void * p = size && nmemb > (size_t)-1 / size? 0 : SIMD_ALIGNED_MALLOC(nmemb * size);
+  if (p)
+    memset(p, 0, nmemb * size);
+  return p;
+}
+
+
+
+void SIMD_ALIGNED_FREE(void * p1)
+{ /* p1: 0 (ignored), or a pointer obtained from SIMD_ALIGNED_MALLOC. */
+  void * * slot = p1;
+  if (slot) free(slot[-1]);    /* malloc's pointer was stored just below p1. */
+}
+
+
+
+#define PFFT_MACROS_ONLY
+#include "pffft.c"
+
+
+
+void ORDERED_CONVOLVE_SIMD(int n, void * not_used, float * a, float const * b)
+{ /* In-place a *= b; presumably on interleaved complex pairs -- TODO confirm. */
+  int i;
+  float ab0, ab1;
+  v4sf       *   RESTRICT   va = (v4sf       *)a;
+  v4sf const *   RESTRICT   vb = (v4sf const *)b;
+  assert(VALIGNED(a) && VALIGNED(b));
+  ab0 = a[0] * b[0], ab1 = a[1] * b[1]; /* 1st 2 are plain products; see below. */
+  for (i = 0; i < n / 4; i += 2) {            /* 2 vectors from each of a, b: */
+    v4sf a1r = va[i+0], a1i = va[i+1];
+    v4sf b1r = vb[i+0], b1i = vb[i+1];
+    UNINTERLEAVE2(a1r, a1i, a1r, a1i);
+    UNINTERLEAVE2(b1r, b1i, b1r, b1i);
+    VCPLXMUL(a1r, a1i, b1r, b1i);
+    INTERLEAVE2(a1r, a1i, a1r, a1i);
+    va[i+0] = a1r, va[i+1] = a1i;
+  }
+  a[0] = ab0, a[1] = ab1;     /* Overwrite what the loop stored in a[0], a[1]. */
+  (void)not_used;
+}
+
+
+
+void ORDERED_PARTIAL_CONVOLVE_SIMD(int n, float * a, float const * b)
+{ /* As ORDERED_CONVOLVE_SIMD, but a[1] is formed from elements n and n+1. */
+  int i;
+  float ab0;
+  v4sf       *   RESTRICT   va = (v4sf       *)a;
+  v4sf const *   RESTRICT   vb = (v4sf const *)b;
+  assert(VALIGNED(a) && VALIGNED(b));
+  ab0 = a[0] * b[0];                /* Saved now as the loop overwrites a[0]. */
+  for (i = 0; i < n / 4; i += 2) {            /* 2 vectors from each of a, b: */
+    v4sf a1r = va[i+0], a1i = va[i+1];
+    v4sf b1r = vb[i+0], b1i = vb[i+1];
+    UNINTERLEAVE2(a1r, a1i, a1r, a1i);
+    UNINTERLEAVE2(b1r, b1i, b1r, b1i);
+    VCPLXMUL(a1r, a1i, b1r, b1i);
+    INTERLEAVE2(a1r, a1i, a1r, a1i);
+    va[i+0] = a1r, va[i+1] = a1i;
+  }
+  a[0] = ab0;
+  a[1] = b[n] * a[n] - b[n+1] * a[n+1];  /* N.B. reads a & b at n and n+1. */
+}
diff --git a/src/util32s.c b/src/util32s.c
new file mode 100644
index 0000000..b9c9e08
--- /dev/null
+++ b/src/util32s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 0
+
+#include "util32s.h"
+
+#include "util-simd.c"
diff --git a/src/util32s.h b/src/util32s.h
new file mode 100644
index 0000000..12226e5
--- /dev/null
+++ b/src/util32s.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_util32s_included
+#define soxr_util32s_included
+
+#include <stddef.h>
+
+void * _soxr_simd32_aligned_malloc(size_t);
+void * _soxr_simd32_aligned_calloc(size_t, size_t);
+void _soxr_simd32_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd32_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd32_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd32_aligned_free
+
+void _soxr_ordered_convolve_simd32(int n, void * not_used, float * a, float const * b);
+void _soxr_ordered_partial_convolve_simd32(int n, float * a, float const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd32
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd32
+
+#endif
diff --git a/src/util64s.c b/src/util64s.c
new file mode 100644
index 0000000..0faa9e9
--- /dev/null
+++ b/src/util64s.c
@@ -0,0 +1,8 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#define PFFFT_DOUBLE 1
+
+#include "util64s.h"
+
+#include "util-simd.c"
diff --git a/src/util64s.h b/src/util64s.h
new file mode 100644
index 0000000..7beeb89
--- /dev/null
+++ b/src/util64s.h
@@ -0,0 +1,23 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#if !defined soxr_util64s_included
+#define soxr_util64s_included
+
+#include <stddef.h>
+
+void * _soxr_simd64_aligned_malloc(size_t);
+void * _soxr_simd64_aligned_calloc(size_t, size_t);
+void _soxr_simd64_aligned_free(void *);
+
+#define SIMD_ALIGNED_MALLOC _soxr_simd64_aligned_malloc
+#define SIMD_ALIGNED_CALLOC _soxr_simd64_aligned_calloc
+#define SIMD_ALIGNED_FREE _soxr_simd64_aligned_free
+
+void _soxr_ordered_convolve_simd64(int n, void * not_used, double * a, double const * b);
+void _soxr_ordered_partial_convolve_simd64(int n, double * a, double const * b);
+
+#define ORDERED_CONVOLVE_SIMD _soxr_ordered_convolve_simd64
+#define ORDERED_PARTIAL_CONVOLVE_SIMD _soxr_ordered_partial_convolve_simd64
+
+#endif
diff --git a/src/vr-coefs.c b/src/vr-coefs.c
new file mode 100644
index 0000000..a57bec8
--- /dev/null
+++ b/src/vr-coefs.c
@@ -0,0 +1,115 @@
+/* SoX Resampler Library         Copyright (c) 2013 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Generate the filter coefficients for variable-rate resampling. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <math.h>
+#define PI 3.14159265358979323846            /* Since M_PI can't be relied on */
+
+static void print(double * h, int m, double l, char const * name)
+{                /* Print out a filter as a C float array called `name': */
+  int i, N = l? (int)(l*m)-(l>1) : m, R=(N+1)/2;
+  int a = !l||l>1? 0:N-R, b = l>1? R:N;       /* Print h[a] to h[b-1] only. */
+  printf("static float const %s[] = {\n", name);
+  if (l>1) printf(" 0.f,"); else if (!l) l=1;   /* From here, l is the gain. */
+  for (i=a; h && i<b; ++i, printf("% .9gf,%c",l*h[i-1],"\n "[(i-a)&3 && i<b]));
+  puts("};\n");                /* With h null, an empty array is printed. */
+  free(h);                                   /* N.B. h is released here. */
+}
+                                                  /* Parks McClellan FIR LPF: */
+#define even_adj(f) ((N&1)? 1 : cos(PI*.5*(f)))
+#define W(f) (((f) < Fp+1e-9? weight : 1) * even_adj(f))      /* Weighting fn */
+#define D(f) (((f) < Fp+1e-9) / even_adj(f))           /* Desired response fn */
+#define F(i) ((i) <= end[0]? (i)*inc[0] : 1-(end[1]-(i))*inc[1])
+#define EE(x,z) (_1 != x 1 && x E[i] > 0 && x E[i] >= x E[i z 1])
+#define PEAK do {if (k<NP+1) peak[k]=i; ++k,_1=(E[i]>0)-(E[i]<0);} while (0)
+
+typedef struct {double x, beta, gamma;} coef_t;
+
+static double amp_response(coef_t * coef, int R, double f, int i)
+{ /* Returns sum(gamma*w)/sum(w), w = beta/(cos(PI*f)-x), over coef[i..R-1]. */
+  double n = 0, d = 0, x = cos(PI*f), t;
+  for (; i < R; d += t = coef[i].beta / t, n += coef[i].gamma * t, ++i)
+    if (fabs(t = x - coef[i].x) < 1e-9) return coef[i].gamma; /* Avoids a 0 div. */
+  return n/d;
+}
+
+static void fir(int m, double l, double Fp0, double Fs0,
+    double weight0, int density, char const * name)
+{ /* Design the filter iteratively (<= 20 passes), then print it as `name'. */
+  double Fp=Fp0/l, Fs=Fs0/l, weight=1/weight0, inc[2], Ws=1-Fs;
+  int N = (int)(l*m)-(l>1), R=(N+1)/2, NP=R+1, grid_size=1+density*R+1, pass=0;
+  int n1 = Ws>=(2*R-1)*Fp? 1:(int)(R*Fp/(Fp+Ws)+.5), n2=NP-n1, _1, i, j, k;
+  int    * peak = calloc(sizeof(*peak), (size_t)(NP+1)), * P=peak, end[2];
+  coef_t * coef = calloc(sizeof(*coef), (size_t)(NP));
+  float  * E    = calloc(sizeof(*E   ), (size_t)(grid_size));
+  double d, n, e, f, mult, delta, sum, hi, lo, * A = (double*)E, *h=0;
+
+  if (!P || !coef || !E) goto END;                /* An allocation failed. */
+  end[0] = n1 * density, end[1] = grid_size-1;     /* Create prototype peaks: */
+  inc[0] = Fp/end[0],    inc[1] = n2==1? 0 : Ws / ((n2-1)*density);
+  for (i=0; i<n1; P[n1-1-i] = end[0] - i*density,++i);
+  for (i=0; i<n2; P[n1+i] = 1+end[0] + i*density,++i);
+
+  do {                                               /* Coefs for amp. resp.: */
+    for (i = 0; i<NP; coef[i].x = cos(PI*F(P[i])), ++i);
+    for (_1=-1, n=d=i=0; i < NP; ++i) {
+      for (mult = 1, j = 0; j < R; ++j) if (j != i) mult *= coef[i].x-coef[j].x;
+      if (mult) coef[i].beta = 1/mult; else goto END;
+      if (i != R) mult *= coef[i].x - coef[R].x;
+      f = F(P[i]), n += D(f)/mult, d += (_1=-_1)/(W(f)*mult);
+    }
+    for (delta = n/d, _1 = -1, i = 0; i < R; ++i)
+      f = F(P[i]), coef[i].gamma = D(f)-(_1=-_1)*delta/W(f);
+    for (i = 0; i <= end[1]; ++i)            /* Amplitude response and error: */
+      f = F(i), E[i] = (float)(W(f)*(D(f) - amp_response(coef, R, f, 0)));
+
+    i = k = _1 = 0;                                        /* Find new peaks: */
+    if (end[0]) if (EE(+,+) || EE(-,+)) PEAK;                       /* At F=0 */
+    for (++i, j = 0; j < 2; ++j) {                              /* In band j: */
+      for (; i < end[j]; ++i)
+        if ((EE(+,-) && E[i]>E[i+1]) || (EE(-,-) && E[i]<E[i+1])) PEAK;
+      if (!j) {PEAK; ++i; PEAK; ++i;}                           /* At Fp & Fs */
+    }
+    if (i==end[1]) if (EE(+,-) || EE(-,-)) PEAK;                    /* At F=1 */
+    if ((unsigned)(k = k-NP) > 1) goto END;                  /* Too many/few? */
+    P = peak + k * (fabs(E[peak[0]]) < fabs(E[peak[NP]]));         /* rm 1st? */
+
+    for (lo = hi = fabs(E[P[0]]), i=1; i<NP; ++i)              /* Converged?: */
+      e = fabs(E[P[i]]), lo = e<lo? e:lo, hi = e>hi? e:hi;
+  } while ((hi-lo)/hi > .001 && ++pass < 20);
+                      /* Create impulse response from final amp. resp. coefs: */
+  if (!(h = malloc(sizeof(*h)*(size_t)N))) goto END;
+  for (i = 0; i < R; f = 2.*i/N, A[i++] = amp_response(coef,R,f,0)*even_adj(f));
+  for (i = 0; i < R; h[N-1-i] = h[i] = sum/N, ++i)
+    for (sum=*A, j=1; j<R; sum += 2*cos(2*PI*(i-(N-1)/2.)/N*j)*A[j], ++j);
+  END: free(coef), free(E), free(peak);        /* N.B. A shares E's storage. */
+  print(h, m, l, name);            /* Also frees h; h is still 0 on failure. */
+}
+                                  /* Half-band IIR LPF (Mitra DSP 3/e, 13_9): */
+static void iir(int N, double Fp, char const * name)
+{ /* Compute N coefficients from Fp, then print them as the array `name'. */
+  double d=tan(PI*.5*Fp), r=d*d, t=sqrt(1-r*r), n=(1-sqrt(t))/(1+sqrt(t))*.5;
+  double x=(n*n)*(n*n), Q=(((150*x+15)*x+2)*x+1)*n, q=pow(Q,.25), *h;
+  int i=0, j, _1;
+  if (!(h = malloc(sizeof(*h)*(size_t)N))) goto END;
+  for (; i<N; t=n*q/d, t=t*t, t=sqrt((1-t*r)*(1-t/r))/(1+t), h[i++]=(1-t)/(1+t))
+    for (_1=1, d=-.5, n=j=0, x=(i+1)*PI/(N+.5); j<7; ++j, _1=-_1)  /* 7 terms */
+      n += _1*pow(Q,j*(j+1))*sin(x*(j+.5)), d += _1*pow(Q,j*j)*cos(x*j);
+  END: print(h, N, 0, name);        /* Also frees h; h is 0 if malloc failed. */
+}
+
+int main(int argc, char **argv)
+{ /* Print, as C source on stdout, a licence header & each coefficient table. */
+  (void)argc, (void)argv;                         /* Neither is made use of. */
+  puts("/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net");
+  puts(" * Licence for this file: LGPL v2.1                  See LICENCE for details. */\n");
+  fir(241,  1, .45,  .5, 160, 32, "half_fir_coefs");
+  fir( 24, .5, .25,  .5,   1, 31, "fast_half_fir_coefs");
+  fir( 20, 12, .9 , 1.5, 160, 58, "coefs0_d");
+  fir( 12,  6, .45, 1.5,  80, 29, "coefs0_u");
+  iir( 15, .492, "iir_coefs");
+  return 0;
+}
diff --git a/src/vr-coefs.h b/src/vr-coefs.h
new file mode 100644
index 0000000..e44138e
--- /dev/null
+++ b/src/vr-coefs.h
@@ -0,0 +1,94 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+/* 121 taps: centre tap [0], then one side of a symmetric FIR (see vr32.c). */
+static float const half_fir_coefs[] = {
+ 0.471112154f,  0.316907549f,  0.0286963396f, -0.101927032f,
+-0.0281272982f,  0.0568029535f,  0.027196876f, -0.0360795942f,
+-0.0259313561f,  0.023641162f,  0.0243660538f, -0.0151238564f,
+-0.0225440668f,  0.00886927471f,  0.0205146088f, -0.00411434209f,
+-0.0183312132f,  0.000458525335f,  0.0160497772f,  0.00233248286f,
+-0.0137265989f, -0.0044106884f,  0.011416442f,  0.005885487f,
+-0.00917074467f, -0.00684373006f,  0.00703601669f,  0.00736018933f,
+-0.00505250698f, -0.00750298261f,  0.00325317131f,  0.00733618346f,
+-0.00166298445f, -0.00692082025f,  0.000298598848f,  0.00631493711f,
+ 0.000831644129f, -0.0055731438f, -0.00172737872f,  0.00474591812f,
+ 0.0023955814f, -0.0038788491f, -0.00284969263f,  0.00301194082f,
+ 0.00310854264f, -0.00217906496f, -0.00319514679f,  0.00140761062f,
+ 0.00313542959f, -0.000718361916f, -0.00295694328f,  0.000125607323f,
+ 0.00268763625f,  0.000362527878f, -0.00235472525f, -0.000743552559f,
+ 0.00198371228f,  0.00101991741f, -0.0015975797f, -0.00119820218f,
+ 0.00121618271f,  0.0012882279f, -0.000855849209f, -0.00130214036f,
+ 0.000529184474f,  0.00125350876f, -0.000245067778f, -0.00115647977f,
+ 8.82118676e-06f,  0.00102502052f,  0.000177478031f, -0.000872275256f,
+-0.000314572995f,  0.000710055602f,  0.000405526007f, -0.000548470439f,
+-0.000455174442f,  0.000395698685f,  0.000469579667f, -0.000257895884f,
+-0.000455495078f,  0.000139222702f,  0.000419883982f, -4.19753541e-05f,
+-0.00036950051f, -3.32020844e-05f,  0.000310554015f,  8.7050045e-05f,
+-0.000248456595f, -0.000121389974f,  0.000187662656f,  0.000138813233f,
+-0.000131587954f, -0.000142374865f,  8.26090549e-05f,  0.000135318039f,
+-4.21208043e-05f, -0.000120830917f,  1.06505085e-05f,  0.00010185819f,
+ 1.20015129e-05f, -8.09558888e-05f, -2.65925299e-05f,  6.02101571e-05f,
+ 3.42775752e-05f, -4.11911155e-05f, -3.64462477e-05f,  2.49654252e-05f,
+ 3.46090513e-05f, -1.21078107e-05f, -3.03027209e-05f,  2.73562006e-06f,
+ 2.51329043e-05f,  3.66157998e-06f, -2.0990973e-05f, -9.38752332e-06f,
+ 2.07133365e-05f,  3.2060847e-05f,  1.98462364e-05f,  4.90328648e-06f,
+-5.28550107e-07f,
+};
+/* 6 taps applied at odd offsets +/-(2i+1); fast_half_fir() adds the .5 centre. */
+static float const fast_half_fir_coefs[] = {
+ 0.309418476f, -0.0819805418f,  0.0305513441f, -0.0101582224f,
+ 0.00251293175f, -0.000346895324f,
+};
+/* 121 points: prototype for poly_fir_coefs_d, expanded by prepare_coefs(). */
+static float const coefs0_d[] = {
+ 0.f, 1.40520362e-05f,  2.32939994e-05f,  4.00699869e-05f,  6.18938797e-05f,
+ 8.79406317e-05f,  0.000116304226f,  0.000143862785f,  0.000166286173f,
+ 0.000178229431f,  0.00017374107f,  0.00014689118f,  9.25928444e-05f,
+ 7.55567388e-06f, -0.000108723934f, -0.000253061416f, -0.000417917952f,
+-0.000591117466f, -0.000756082504f, -0.000892686881f, -0.000978762367f,
+-0.000992225841f, -0.00091370246f, -0.000729430325f, -0.000434153678f,
+-3.36489703e-05f,  0.000453499646f,  0.000995243588f,  0.00154683724f,
+ 0.00205322353f,  0.00245307376f,  0.0026843294f,  0.0026908874f,
+ 0.00242986868f,  0.00187874742f,  0.00104150259f, -4.70759945e-05f,
+-0.00131972748f, -0.00267834298f, -0.00399923407f, -0.00514205849f,
+-0.00596200535f, -0.00632441105f, -0.00612058374f, -0.00528328869f,
+-0.00380015804f, -0.0017232609f,  0.000826765169f,  0.0036632503f,
+ 0.00654337507f,  0.00918536843f,  0.0112922007f,  0.0125801323f,
+ 0.0128097433f,  0.0118164904f,  0.00953750551f,  0.00603133188f,
+ 0.00148762708f, -0.00377544588f, -0.009327395f, -0.014655127f,
+-0.0192047839f, -0.0224328082f, -0.0238620596f, -0.0231377935f,
+-0.0200777417f, -0.0147104883f, -0.00729690011f,  0.0016694689f,
+ 0.0114853672f,  0.02128446f,  0.0301054204f,  0.03697694f,
+ 0.0410129138f,  0.0415093321f,  0.0380333749f,  0.0304950299f,
+ 0.0191923285f,  0.00482304203f, -0.0115416941f, -0.0285230397f,
+-0.0445368533f, -0.0579264573f, -0.0671158215f, -0.070770308f,
+-0.0679502076f, -0.0582416438f, -0.0418501969f, -0.0196448429f,
+ 0.00685658762f,  0.0355644891f,  0.0639556622f,  0.0892653703f,
+ 0.108720484f,  0.11979613f,  0.120474745f,  0.109484562f,
+ 0.0864946948f,  0.0522461633f,  0.00860233712f, -0.041491734f,
+-0.0941444939f, -0.144742955f, -0.188255118f, -0.219589829f,
+-0.233988169f, -0.227416437f, -0.196929062f, -0.140970726f,
+-0.0595905561f,  0.0454527813f,  0.170708227f,  0.311175511f,
+ 0.460568159f,  0.61168037f,  0.756833088f,  0.888367707f,
+ 0.999151395f,  1.08305644f,  1.13537741f,  1.15315438f,
+};
+/* 37 points: prototype for poly_fir_coefs_u, expanded by prepare_coefs(). */
+static float const coefs0_u[] = {
+ 0.f, 2.4378013e-05f,  9.70782157e-05f,  0.000256572953f,  0.000527352928f,
+ 0.000890796838f,  0.00124949518f,  0.00140604793f,  0.00107945998f,
+-2.15586031e-05f, -0.00206589462f, -0.00493342625f, -0.00807135101f,
+-0.0104515787f, -0.0107039866f, -0.00746258988f,  0.000109078838f,
+ 0.0117345872f,  0.0255795186f,  0.0381690155f,  0.0448461522f,
+ 0.0408218138f,  0.0226797758f, -0.00999595371f, -0.0533441602f,
+-0.0987927774f, -0.133827418f, -0.144042973f, -0.116198269f,
+-0.0416493482f,  0.0806808506f,  0.242643854f,  0.427127981f,
+ 0.610413245f,  0.766259257f,  0.8708884f,  0.907742029f,
+};
+/* 15 section coefficients consumed in index order by half_iir1() in vr32.c. */
+static float const iir_coefs[] = {
+ 0.0262852045f,  0.0998310478f,  0.206865061f,  0.330224134f,
+ 0.454420362f,  0.568578357f,  0.666944466f,  0.747869771f,
+ 0.812324404f,  0.8626001f,  0.901427744f,  0.931486057f,
+ 0.955191529f,  0.974661783f,  0.991776305f,
+};
+
diff --git a/src/vr32.c b/src/vr32.c
new file mode 100644
index 0000000..8b1a259
--- /dev/null
+++ b/src/vr32.c
@@ -0,0 +1,651 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Variable-rate resampling. */
+
+#include <assert.h>
+#include "math-wrap.h"
+#include <string.h>
+#include <stdlib.h>
+#include "internal.h"
+#define FIFO_SIZE_T int
+#define FIFO_MIN 0x8000
+#include "fifo.h"
+#include "vr-coefs.h"
+/* Sizes of the fade table, the polyphase tables and the half-band FIR: */
+#define FADE_LEN_BITS     9  /* fade_coefs has (2 << FADE_LEN_BITS) + 1 entries */
+#define PHASE_BITS_D      10 /* top bits of frac that pick a phase, poly_fir1_d */
+#define PHASE_BITS_U      9  /* top bits of frac that pick a phase, poly_fir1_u */
+
+#define PHASES0_D         12 /* `phases0' passed to prepare_coefs for coefs0_d */
+#define POLY_FIR_LEN_D    20 /* taps per output in poly_fir1_d */
+#define PHASES0_U         6  /* `phases0' passed to prepare_coefs for coefs0_u */
+#define POLY_FIR_LEN_U    12 /* taps per output in poly_fir1_u */
+
+#define MULT32            (65536. * 65536.)   /* 2^32 as a double */
+#define PHASES_D          (1 << PHASE_BITS_D)
+#define PHASES_U          (1 << PHASE_BITS_U)
+/* CONVOLVE expands to 60 copies of whatever `_' is defined as at that point: */
+#define CONVOLVE \
+    _ _ _ _ _ _ _ _ _ _  _ _ _ _ _ _ _ _ _ _ \
+    _ _ _ _ _ _ _ _ _ _  _ _ _ _ _ _ _ _ _ _ \
+    _ _ _ _ _ _ _ _ _ _  _ _ _ _ _ _ _ _ _ _
+/* Index of the last half_fir_coefs entry, and half of that: */
+#define HALF_FIR_LEN_2 (iAL(half_fir_coefs) - 1)
+#define HALF_FIR_LEN_4 (HALF_FIR_LEN_2 / 2)
+/* Symmetric FIR about input[0]: centre tap + 120 mirrored pairs, all coefs. */
+#define _ sum += (input[-i] + input[i]) * half_fir_coefs[i], ++i;
+static float half_fir(float const * input)
+{
+  long i = 1;
+  float sum = input[0] * half_fir_coefs[0];
+  CONVOLVE CONVOLVE                      /* 2 x 60 unrolled tap pairs. */
+  assert(i == HALF_FIR_LEN_2 + 1);       /* Unroll count must match the table. */
+  return (float)sum;
+}
+#undef _
+/* First of two outputs per input sample: even-indexed coefs only, result x2. */
+#define _ sum += (input[-i] + input[i]) * half_fir_coefs[2*i], ++i;
+static float double_fir0(float const * input)
+{
+  int i = 1;
+  float sum = input[0] * half_fir_coefs[0];
+  CONVOLVE                               /* 60 unrolled tap pairs. */
+  assert(i == HALF_FIR_LEN_4 + 1);
+  return (float)(sum * 2);
+}
+#undef _
+/* Second of the pair: odd-indexed coefs on input[-i] and input[1+i], x2. */
+#define _ sum += (input[-i] + input[1+i]) * half_fir_coefs[2*i+1], ++i;
+static float double_fir1(float const * input)
+{
+  int i = 0;
+  float sum = 0;
+  CONVOLVE                               /* 60 unrolled tap pairs. */
+  assert(i == HALF_FIR_LEN_4 + 0);
+  return (float)(sum * 2);
+}
+#undef _
+/* Short half-band FIR: .5 * centre plus 6 mirrored pairs at odd offsets. */
+static float fast_half_fir(float const * input)
+{
+  int i = 0;
+  float sum = input[0] * .5f;
+#define _ sum += (input[-(2*i+1)] + input[2*i+1]) * fast_half_fir_coefs[i], ++i;
+  _ _ _ _ _ _
+#undef _
+  return (float)sum;
+}
+/* Each `_' runs two sections, y = (in - y_old) * coef + previous_in: in1, in0. */
+#define IIR_FILTER _ _ _ _ _ _ _
+#define _ in1=(in1-p->y[i])*iir_coefs[i]+tmp1;tmp1=p->y[i],p->y[i]=in1;++i;\
+          in0=(in0-p->y[i])*iir_coefs[i]+tmp0;tmp0=p->y[i],p->y[i]=in0;++i;
+/* x[]: previous in0/in1; y[]: one stored output per iir_coefs section. */
+typedef struct {float x[2], y[AL(iir_coefs)];} half_iir_t;
+/* Filter one (in0, in1) pair through the two interleaved chains; return sum. */
+static float half_iir1(half_iir_t * p, float in0, float in1)
+{
+  int i = 0;
+  float tmp0, tmp1;
+  tmp0 = p->x[0], p->x[0] = in0;
+  tmp1 = p->x[1], p->x[1] = in1;
+  IIR_FILTER                             /* Sections 0..13 (7 per chain). */
+  p->y[i] = in1 = (in1 - p->y[i]) * iir_coefs[i] + tmp1;  /* Section 14. */
+  return in1 + in0;
+}
+#undef _
+/* Write olen outputs, each being half_iir1() of one consecutive input pair. */
+static void half_iir(half_iir_t * p, float * obuf, float const * ibuf, int olen)
+{
+  int k;
+  for (k = 0; k < olen; ++k) obuf[k] = half_iir1(p, ibuf[2*k], ibuf[2*k + 1]);
+}
+/* In-place half_iir1(p, buf[i], 0) over buf, then nudge the even y[] states. */
+static void half_phase(half_iir_t * p, float * buf, int len)
+{
+  float const small_normal = 1/MULT32/MULT32; /* To quash denormals on path 0.*/
+  int i;
+  for (i = 0; i < len; buf[i] = (float)half_iir1(p, buf[i], 0), ++i);
+#define _ p->y[i] += small_normal, i += 2;
+  i = 0, _ IIR_FILTER      /* Adds small_normal to y[0], y[2], ..., y[14]. */
+#undef _
+#define _ p->y[i] -= small_normal, i += 2;
+  i = 0, _ IIR_FILTER      /* Subtracts it again from the same entries. */
+#undef _
+}
+/* coef(): entry of a [phase][tap][interp_order+1] table, interp index reversed. */
+#define coef(coef_p, interp_order, fir_len, phase_num, coef_interp_num, \
+    fir_coef_num) coef_p[(fir_len) * ((interp_order) + 1) * (phase_num) + \
+    ((interp_order) + 1) * (fir_coef_num) + (interp_order - coef_interp_num)]
+/* COEF(): h holds half of a length-l response; mirror i > l/2, 0 if outside. */
+#define COEF(h,l,i) ((i)<0||(i)>=(l)?0:(h)[(i)>(l)/2?(l)-(i):(i)])
+static void prepare_coefs(float * coefs, int n, int phases0, int phases,
+    float const * coefs0, double multiplier)
+{
+  double k[6];                  /* Sliding window of 6 neighbouring coefs0. */
+  int length0 = n * phases0, length = n * phases, K0 = iAL(k)/2 - 1, i, j, pos;
+  float * coefs1 = malloc(((size_t)length / 2  + 1) * sizeof(*coefs1)); /* NOTE(review): unchecked */
+  float * p = coefs1, f0, f1 = 0;
+  /* Pass 1: resample coefs0 (phases0 per tap) to `phases' per tap in coefs1. */
+  for (j = 0; j < iAL(k); k[j] = COEF(coefs0, length0, j - K0), ++j);
+  for (pos = i = 0; i < length0 / 2; ++i) {
+    double b=(1/24.)*(k[0]+k[4]+6*k[2]-4*(k[1]+k[3])),d=.5*(k[1]+k[3])-k[2]-b;
+    double a=(1/120.)*(k[5]-k[2]-9*(9*b+d)+2.5*(k[3]-k[1])-2*(k[4]-k[0]));
+    double c=(1/12.)*(k[4]-k[0]-2*(k[3]-k[1])-60*a),e=.5*(k[3]-k[1])-a-c;
+    for (; pos / phases == i; pos += phases0) {
+      double x = (double)(pos % phases) / phases;   /* Fraction past k[K0]. */
+      *p++ = (float)(k[K0] + ((((a*x + b)*x + c)*x + d)*x + e)*x);
+    }
+    for (j = 0; j < iAL(k) - 1; k[j] = k[j + 1], ++j);   /* Slide the window. */
+    k[j] = COEF(coefs0, length0, i + iAL(k) / 2 + 1);
+  }
+  if (!(length & 1))
+    *p++ = (float)k[K0];
+  assert(p - coefs1 == length / 2  + 1);
+  /* Pass 2: per (phase j, tap i) store the scaled value and its difference. */
+  for (i = 0; i < n; ++i) for (j = phases - 1; j >= 0; --j, f1 = f0) {
+    pos = (n - 1 - i) * phases + j;
+    f0 = COEF(coefs1, length, pos) * (float)multiplier;
+    coef(coefs, 1, n, j, 0, i) = (float)f0;
+    coef(coefs, 1, n, j, 1, i) = (float)(f1 - f0); /* f1: previous pass's f0. */
+  }
+  free(coefs1);
+}
+/* One output sample: each tap uses (a + b*x), a/b from the phase's table row. */
+#define _ sum += (b *x + a)*input[i], ++i;
+#define a (coef(poly_fir_coefs_d, 1, POLY_FIR_LEN_D, phase, 0,i))
+#define b (coef(poly_fir_coefs_d, 1, POLY_FIR_LEN_D, phase, 1,i))
+static float poly_fir_coefs_d[POLY_FIR_LEN_D * PHASES_D * 2];
+/* 20-tap version; `frac' is the 32-bit fractional position. */
+static float poly_fir1_d(float const * input, uint32_t frac)
+{
+  int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_D)); /* Top bits: phase. */
+  float sum = 0, x = (float)(frac << PHASE_BITS_D) * (float)(1 / MULT32);
+  _ _ _ _ _  _ _ _ _ _  _ _ _ _ _  _ _ _ _ _
+  assert(i == POLY_FIR_LEN_D);
+  return (float)sum;
+}
+#undef a
+#undef b
+#define a (coef(poly_fir_coefs_u, 1, POLY_FIR_LEN_U, phase, 0,i))
+#define b (coef(poly_fir_coefs_u, 1, POLY_FIR_LEN_U, phase, 1,i))
+static float poly_fir_coefs_u[POLY_FIR_LEN_U * PHASES_U * 2];
+/* 12-tap version; same scheme with the _U table and phase count. */
+static float poly_fir1_u(float const * input, uint32_t frac)
+{
+  int i = 0, phase = (int)(frac >> (32 - PHASE_BITS_U)); /* Top bits: phase. */
+  float sum = 0, x = (float)(frac << PHASE_BITS_U) * (float)(1 / MULT32);
+  _ _ _ _ _  _ _ _ _ _  _ _
+  assert(i == POLY_FIR_LEN_U);
+  return (float)sum;
+}
+#undef a
+#undef b
+#undef _
+/* Helpers for the 64-bit positions below (`all' overlays integer and frac): */
+#define ADD_TO(x,y)           x.all += y.all
+#define SUBTRACT_FROM(x,y)    x.all -= y.all
+#define FRAC(x)               x.part.frac
+#define INT(x)                x.part.integer
+/* One polyphase read-out stream over a stage's buffer. */
+typedef struct {
+  union {
+    int64_t all;
+#if HAVE_BIGENDIAN
+    struct {int32_t integer; uint32_t frac;} part;
+#else
+    struct {uint32_t frac; int32_t integer;} part;
+#endif
+  } at, step, step_step; /* Position; its increment; the increment's increment. */
+  float const * input;   /* Set from stage_read_p() of the stream's stage. */
+  int len, stage_num;    /* Reading stops once INT(at) reaches len. */
+  bool is_d; /* true: downsampling at x2 rate; false: upsampling at 1x rate. */
+  double step_mult;      /* Scales an io_ratio into `step' (see set_step). */
+} stream_t;
+/* Emit outputs in pairs via poly_fir1_d; returns the number written. */
+static int poly_fir_d(stream_t * s, float * output, int olen)
+{
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; ++i) {
+    output[i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    if (!(INT(s->at) < s->len)) {   /* No input for the pair's 2nd sample: */
+      SUBTRACT_FROM(s->at, s->step); /* undo the advance; i excludes the 1st. */
+      break;
+    }
+    output[++i] = poly_fir1_d(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);  /* Step is slewed once per output pair. */
+  }
+  return i;
+}
+/* As poly_fir_d, but accumulates into output scaled by *vol (stride `step'). */
+static int poly_fir_fade_d(
+    stream_t * s, float const * vol, int step, float * output, int olen)
+{
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_D / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; ++i, vol += step) {
+    output[i] += *vol * poly_fir1_d(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    if (!(INT(s->at) < s->len)) {   /* No input for the pair's 2nd sample: */
+      SUBTRACT_FROM(s->at, s->step); /* undo the advance; i excludes the 1st. */
+      break;
+    }
+    output[++i] += *(vol += step) * poly_fir1_d(input + INT(s->at),FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);  /* Step is slewed once per output pair. */
+  }
+  return i;
+}
+/* Emit single outputs via poly_fir1_u until olen or input runs out. */
+static int poly_fir_u(stream_t * s, float * output, int olen)
+{
+  float const * const taps0 = s->input - POLY_FIR_LEN_U / 2 + 1;
+  int done = 0;
+  while (done < olen && INT(s->at) < s->len) {
+    output[done++] = poly_fir1_u(taps0 + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);            /* Advance the read position... */
+    ADD_TO(s->step, s->step_step);     /* ...then slew the step itself. */
+  }
+  return done;
+}
+/* Fade variant of poly_fir_u: adds into every 2nd slot of output only. */
+static int poly_fir_fade_u(
+    stream_t * s, float const * vol, int step, float * output, int olen)
+{
+  int i;
+  float const * input = s->input - POLY_FIR_LEN_U / 2 + 1;
+  for (i = 0; i < olen && INT(s->at) < s->len; i += 2, vol += step) {
+    output[i] += *vol * poly_fir1_u(input + INT(s->at), FRAC(s->at));
+    ADD_TO(s->at, s->step);
+    ADD_TO(s->step, s->step_step);
+  }
+  return i;            /* Counts slots skipped as well as slots written. */
+}
+/* Shift helpers accepting negative counts, and per-stage FIFO accessors: */
+#define shiftr(x,by) ((by) < 0? (x) << (-(by)) : (x) >> (by))
+#define shiftl(x,by) shiftr(x,-(by))
+#define stage_occupancy(s) (fifo_occupancy(&(s)->fifo) - 4*HALF_FIR_LEN_2)
+#define stage_read_p(s) ((float *)fifo_read_ptr(&(s)->fifo) + 2*HALF_FIR_LEN_2)
+#define stage_preload(s) memset(fifo_reserve(&(s)->fifo, (s)->preload), \
+    0, sizeof(float) * (size_t)(s)->preload); /* N.B. expansion ends in `;' */
+/* One rate-halving/doubling stage and the FIFO holding its samples. */
+typedef struct {
+  fifo_t fifo;
+  double step_mult;                   /* Set in vr_init from the stage index. */
+  int is_fast, x_fade_len, preload;   /* x_fade_len: crossfade samples left. */
+} stage_t;
+/* Per-channel state of the variable-rate resampler. */
+typedef struct {
+  int num_stages0, num_stages, flushing; /* num_stages = max(num_stages0, 1). */
+  int fade_len, slew_len, xfade, stage_inc, switch_stage_num;
+  double new_io_ratio, default_io_ratio; /* default: non-0 until first set. */
+  stage_t * stages;                      /* Valid indices: -1 .. num_stages-1. */
+  fifo_t output_fifo;
+  half_iir_t halfer;
+  stream_t current, fadeout; /* Current/fade-in, fadeout streams. */
+} rate_t;
+/* Raised-cosine fade table, 1 down to 0; filled by the first vr_init() call. */
+static float fade_coefs[(2 << FADE_LEN_BITS) + 1];
+/* Zero *p, create its FIFOs; the first call also builds the shared tables. */
+static void vr_init(rate_t * p, double default_io_ratio, int num_stages, double mult)
+{
+  int i;
+  assert(num_stages >= 0);
+  memset(p, 0, sizeof(*p));
+
+  p->num_stages0 = num_stages;
+  p->num_stages = num_stages = max(num_stages, 1);
+  p->stages = (stage_t *)calloc((unsigned)num_stages + 1, sizeof(*p->stages)) + 1; /* NOTE(review): calloc result unchecked */
+  for (i = -1; i < p->num_stages; ++i) {   /* stages[-1] is a valid element. */
+    stage_t * s = &p->stages[i];
+    fifo_create(&s->fifo, sizeof(float));
+    s->step_mult = 2 * MULT32 / shiftl(2, i);
+    s->preload = i < 0? 0 : i == 0? 2 * HALF_FIR_LEN_2 : 3 * HALF_FIR_LEN_2 / 2;
+    stage_preload(s);                      /* Appends `preload' zero samples. */
+    s->is_fast = true;
+    lsx_debug("%-3i preload=%i", i, s->preload);
+  }
+  fifo_create(&p->output_fifo, sizeof(float));
+  p->default_io_ratio = default_io_ratio;
+  if (fade_coefs[0]==0) {  /* Tables not built yet; `mult' is used only here. */
+    for (i = 0; i < iAL(fade_coefs); ++i)
+      fade_coefs[i] = (float)(.5 * (1 + cos(M_PI * i / (AL(fade_coefs) - 1))));
+    prepare_coefs(poly_fir_coefs_u, POLY_FIR_LEN_U, PHASES0_U, PHASES_U, coefs0_u, mult);
+    prepare_coefs(poly_fir_coefs_d, POLY_FIR_LEN_D, PHASES0_D, PHASES_D, coefs0_d, mult *.5);
+  }
+  assert(fade_coefs[0]);
+}
+/* Re-aim the current stream at the stage named by p->current.stage_num. */
+static void enter_new_stage(rate_t * p, int occupancy0)
+{
+  stream_t * const cur = &p->current;
+  stage_t * const stage = &p->stages[cur->stage_num];
+  bool const down = cur->stage_num >= 0;   /* Stages >= 0 are read at x2 rate. */
+  cur->len = shiftr(occupancy0, cur->stage_num);
+  cur->input = stage_read_p(stage);
+  cur->is_d = down;
+  cur->step_mult = down? stage->step_mult * .5 : stage->step_mult;
+}
+/* step = io_ratio * step_mult, +.5 then truncated to the 64-bit fixed point. */
+static void set_step(stream_t * p, double io_ratio)
+{
+  p->step.all = (int64_t)(io_ratio * p->step_mult + .5);
+}
+/* Set step_step so step reaches io_ratio's step in slew_len additions. */
+static bool set_step_step(stream_t * p, double io_ratio, int slew_len)
+{
+  int64_t dif;
+  int difi;
+  stream_t tmp = *p;                /* Scratch copy: p->step stays untouched. */
+  set_step(&tmp, io_ratio);
+  dif = tmp.step.all - p->step.all;
+  dif = dif < 0? dif - (slew_len >> 1) : dif + (slew_len >> 1); /* For rounding. */
+  difi = (int)dif;   /* Try to avoid int64_t div. */
+  p->step_step.all = difi == dif? difi / slew_len : dif / slew_len;
+  return p->step_step.all != 0;     /* false: change too small to slew. */
+}
+/* Change the ratio: slewed over slew_len outputs, or at once if slew_len==0. */
+static void vr_set_io_ratio(rate_t * p, double io_ratio, size_t slew_len)
+{
+  assert(io_ratio > 0);
+  if (slew_len) {
+    if (!set_step_step(&p->current, io_ratio, p->slew_len = (int)slew_len))
+      p->slew_len = 0, p->new_io_ratio = 0, p->fadeout.step_step.all = 0;
+    else {
+      p->new_io_ratio = io_ratio;  /* vr_process applies it after the slew. */
+      if (p->fade_len)
+        set_step_step(&p->fadeout, io_ratio, p->slew_len);
+    }
+  }
+  else {
+    if (p->default_io_ratio!=0) { /* Then this is the first call to this fn. */
+      int octave = (int)floor(log(io_ratio) / M_LN2);
+      p->current.stage_num = octave < 0? -1 : min(octave, p->num_stages0-1);
+      enter_new_stage(p, 0);
+    }
+    else if (p->fade_len)
+      set_step(&p->fadeout, io_ratio);
+    set_step(&p->current, io_ratio);
+    if (p->default_io_ratio!=0) FRAC(p->current.at) = FRAC(p->current.step) >> 1;
+    p->default_io_ratio = 0;       /* Marks that a ratio has now been set. */
+  }
+}
+/* Fill stage stage_num from stage (stage_num - sign); false if nothing new. */
+static bool do_input_stage(rate_t * p, int stage_num, int sign, int min_stage_num)
+{
+  int i = 0;
+  float * dest;
+  stage_t * s = &p->stages[stage_num];
+  stage_t * s1 = &p->stages[stage_num - sign];           /* Source stage. */
+  float const * src = (float *)fifo_read_ptr(&s1->fifo) + HALF_FIR_LEN_2;
+  int len = shiftr(fifo_occupancy(&s1->fifo) - HALF_FIR_LEN_2 * 2, sign);
+  int already_done = fifo_occupancy(&s->fifo) - s->preload;
+  if ((len -= already_done) <= 0)
+    return false;
+  src += shiftl(already_done, sign);     /* Skip source already consumed. */
+
+  dest = fifo_reserve(&s->fifo, len);
+  if (stage_num < 0) for (; i < len; ++src)   /* Two outputs per source sample. */
+    dest[i++] = double_fir0(src), dest[i++] = double_fir1(src);
+  else {                                      /* One output per two source samples. */
+    bool should_be_fast = p->stage_inc;
+    if (!s->x_fade_len && stage_num == p->switch_stage_num) {
+      p->switch_stage_num = 0;
+      if (s->is_fast != should_be_fast) {     /* Start a filter crossfade. */
+        s->x_fade_len = 1 << FADE_LEN_BITS, s->is_fast = should_be_fast, ++p->xfade;
+        lsx_debug("xfade level %i, inc?=%i", stage_num, p->stage_inc);
+      }
+    }
+    if (s->x_fade_len) {   /* Blend both filters' outputs with fade weights. */
+      float const * vol1 = fade_coefs + (s->x_fade_len << 1);
+      float const * vol2 = fade_coefs + (((1 << FADE_LEN_BITS) - s->x_fade_len) << 1);
+      int n = min(len, s->x_fade_len);
+      /*lsx_debug("xfade level %i, inc?=%i len=%i n=%i", stage_num, p->stage_inc, s->x_fade_len, n);*/
+      if (should_be_fast)
+        for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
+          dest[i++] = *vol1 * fast_half_fir(src) + *vol2 * half_fir(src);
+      else for (; i < n; vol2 += 2, vol1 -= 2, src += 2)
+        dest[i++] = *vol2 * fast_half_fir(src) + *vol1 * half_fir(src);
+      s->x_fade_len -= n;
+      p->xfade -= !s->x_fade_len;             /* Crossfade finished. */
+    }
+    if (stage_num < min_stage_num)            /* Remainder uses one filter only. */
+      for (; i < len; dest[i++] = fast_half_fir(src), src += 2);
+    else for (; i < len; dest[i++] = half_fir(src), src += 2);
+  }
+  if (p->flushing > 0)
+    stage_preload(s);
+  return true;
+}
+/* Run the stages on queued input, then produce up to olen0 outputs; returns count. */
+static int vr_process(rate_t * p, int olen0)
+{
+  assert(p->num_stages > 0);
+  if (p->default_io_ratio!=0)     /* No ratio set yet: apply the default. */
+    vr_set_io_ratio(p, p->default_io_ratio, 0);
+  {
+    float * output = fifo_reserve(&p->output_fifo, olen0);
+    int j, odone0 = 0, min_stage_num = p->current.stage_num;
+    int occupancy0, max_stage_num = min_stage_num;
+    if (p->fade_len) {            /* A stage switch is fading: span both stages. */
+      min_stage_num = min(min_stage_num, p->fadeout.stage_num);
+      max_stage_num = max(max_stage_num, p->fadeout.stage_num);
+    }
+
+    for (j = min(min_stage_num, 0); j <= max_stage_num; ++j)
+      if (j && !do_input_stage(p, j, j < 0? -1 : 1, min_stage_num))
+        break;
+    if (p->flushing > 0)
+      p->flushing = -1;
+
+    occupancy0 = shiftl(max(0,stage_occupancy(&p->stages[max_stage_num])), max_stage_num);
+    p->current.len = shiftr(occupancy0, p->current.stage_num);
+    p->current.input = stage_read_p(&p->stages[p->current.stage_num]);
+    if (p->fade_len) {
+      p->fadeout.len = shiftr(occupancy0, p->fadeout.stage_num);
+      p->fadeout.input = stage_read_p(&p->stages[p->fadeout.stage_num]);
+    }
+
+    while (odone0 < olen0) {      /* Produce output in chunks of <= 64 samples. */
+      int odone, odone2, olen = olen0 - odone0, stage_dif = 0, shift;
+      float buf[64 << 1];
+
+      olen = min(olen, (int)(AL(buf) >> 1));
+      if (p->slew_len)
+        olen = min(olen, p->slew_len);
+      else if (p->new_io_ratio!=0) {   /* Slew finished: land on the target. */
+        set_step(&p->current, p->new_io_ratio);
+        set_step(&p->fadeout, p->new_io_ratio);
+        p->fadeout.step_step.all = p->current.step_step.all = 0;
+        p->new_io_ratio = 0;
+      }
+      if (!p->flushing && !p->fade_len && !p->xfade) { /* Stage change needed? */
+        if (p->current.is_d) {
+          if (INT(p->current.step) && FRAC(p->current.step))
+            stage_dif = 1, ++max_stage_num;
+          else if (!INT(p->current.step) && FRAC(p->current.step) < (1u << 31))
+            stage_dif = -1, --min_stage_num;
+        } else if (INT(p->current.step) > 1 && FRAC(p->current.step))
+          stage_dif = 1, ++max_stage_num;
+      }
+      if (stage_dif) {
+        int n = p->current.stage_num + stage_dif;
+        if (n >= p->num_stages)        /* No such stage: stay where we are. */
+          --max_stage_num;
+        else {
+          p->stage_inc = stage_dif > 0;
+          p->fadeout = p->current;     /* Old stream fades out from here on. */
+          p->current.stage_num += stage_dif;
+          if (!p->stage_inc)
+          p->switch_stage_num = p->current.stage_num; /* Body of the `if' above. */
+          if ((p->current.stage_num < 0 && stage_dif < 0) ||
+              (p->current.stage_num > 0 && stage_dif > 0)) {
+            stage_t * s = &p->stages[p->current.stage_num];
+            fifo_clear(&s->fifo);
+            stage_preload(s);
+            s->is_fast = false;
+            do_input_stage(p, p->current.stage_num, stage_dif, p->current.stage_num);
+          }
+          if (p->current.stage_num > 0 && stage_dif < 0) {
+            int idone = INT(p->current.at);
+            stage_t * s = &p->stages[p->current.stage_num];
+            fifo_trim_to(&s->fifo, 2 * HALF_FIR_LEN_2 + idone + (POLY_FIR_LEN_D >> 1));
+            do_input_stage(p, p->current.stage_num, 1, p->current.stage_num);
+          }
+          enter_new_stage(p, occupancy0);
+          shift = -stage_dif;          /* Rescale position/step to the new stage. */
+#define lshift(x,by) (x)=(by)>0?(x)<<(by):(x)>>-(by)
+          lshift(p->current.at.all, shift);
+          shift += p->fadeout.is_d - p->current.is_d;
+          lshift(p->current.step.all, shift);
+          lshift(p->current.step_step.all, shift);
+          p->fade_len = AL(fade_coefs) - 1;
+          lsx_debug("switch from stage %i to %i, x2 from %i to %i", p->fadeout.stage_num, p->current.stage_num, p->fadeout.is_d, p->current.is_d);
+        }
+      }
+
+      if (p->fade_len) {          /* Mix fade-in and fade-out streams into buf. */
+        float const * vol1 = fade_coefs + p->fade_len;
+        float const * vol2 = fade_coefs + (iAL(fade_coefs) - 1 - p->fade_len);
+        int olen2 = (olen = min(olen, p->fade_len >> 1)) << 1;
+
+        /* x2 is more fine-grained so may fail to produce a pair of samples
+         * where x1 would not (the x1 second sample is a zero so is always
+         * available).  So do x2 first, then feed odone to the second one. */
+        memset(buf, 0, sizeof(*buf) * (size_t)olen2);
+        if (p->current.is_d && p->fadeout.is_d) {
+          odone  = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
+          odone2 = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, odone);
+        } else if (p->current.is_d) {
+          odone  = poly_fir_fade_d(&p->current, vol1,-1, buf, olen2);
+          odone2 = poly_fir_fade_u(&p->fadeout, vol2, 2, buf, odone);
+        } else {
+          assert(p->fadeout.is_d);
+          odone  = poly_fir_fade_d(&p->fadeout, vol2, 1, buf, olen2);
+          odone2 = poly_fir_fade_u(&p->current, vol1,-2, buf, odone);
+        }
+        assert(odone == odone2);
+        (void)odone2;
+        p->fade_len -= odone;
+        if (!p->fade_len) {       /* Fade complete: drop the stage no longer used. */
+          if (p->stage_inc)
+            p->switch_stage_num = min_stage_num++;
+          else
+            --max_stage_num;
+        }
+        half_iir(&p->halfer, &output[odone0], buf, odone >>= 1);
+      }
+      else if (p->current.is_d) { /* x2-rate stream: produce 2*olen, then halve. */
+        odone = poly_fir_d(&p->current, buf, olen << 1) >> 1;
+        half_iir(&p->halfer, &output[odone0], buf, odone);
+      }
+      else {                      /* 1x-rate stream: write straight to output. */
+        odone = poly_fir_u(&p->current, &output[odone0], olen);
+        if (p->num_stages0)
+          half_phase(&p->halfer, &output[odone0], odone);
+      }
+      odone0 += odone;
+      if (p->slew_len)
+        p->slew_len -= odone;
+      if (odone != olen)
+        break; /* Need more input. */
+    } {                           /* Discard the input consumed in each stage. */
+      int from = max(0, max_stage_num), to = min(0, min_stage_num);
+      int i, idone = shiftr(INT(p->current.at), from - p->current.stage_num);
+      INT(p->current.at) -= shiftl(idone, from - p->current.stage_num);
+      if (p->fade_len)
+        INT(p->fadeout.at) -= shiftl(idone, from - p->fadeout.stage_num);
+      for (i = from; i >= to; --i, idone <<= 1)
+        fifo_read(&p->stages[i].fifo, idone, NULL);
+    }
+    fifo_trim_by(&p->output_fifo, olen0 - odone0); /* Give back unused space. */
+    return odone0;
+  }
+}
+/* Queue n input samples in stage 0's FIFO; returns what fifo_write() returns. */
+static float * vr_input(rate_t * p, float const * input, size_t n)
+{ fifo_t * const fifo0 = &p->stages[0].fifo;
+  return fifo_write(fifo0, (int)n, input);
+}
+/* Read up to *n samples from the output FIFO; *n is updated to the count read. */
+static float const * vr_output(rate_t * p, float * output, size_t * n)
+{
+  fifo_t * fifo = &p->output_fifo;
+  if (1 || !p->num_stages0)   /* `1 ||' makes this branch unconditional. */
+    return fifo_read(fifo, (int)(*n = min(*n, (size_t)fifo_occupancy(fifo))), output);
+  else { /* Ignore this complication for now. */
+    int const IIR_DELAY = 2;  /* Currently unreachable (see the `1 ||' above). */
+    float * ptr = fifo_read_ptr(fifo);
+    int olen = min((int)*n, max(0, fifo_occupancy(fifo) - IIR_DELAY));
+    *n = (size_t)olen;
+    if (output)
+      memcpy(output, ptr + IIR_DELAY, *n * sizeof(*output));
+    fifo_read(fifo, olen, NULL);
+    return ptr + IIR_DELAY;
+  }
+}
+/* First call only: append stage 0's preload of zeros and mark as flushing. */
+static void vr_flush(rate_t * p)
+{
+  if (p->flushing)
+    return;
+  stage_preload(&p->stages[0]);
+  p->flushing = 1;
+}
+/* Delete the output FIFO and every stage FIFO, then free the stage array. */
+static void vr_close(rate_t * p)
+{
+  stage_t * const first = p->stages - 1; /* Array really starts at stages[-1]. */
+  int k;
+
+  fifo_delete(&p->output_fifo);
+  for (k = 0; k <= p->num_stages; ++k)   /* stages[-1] .. stages[num_stages-1] */
+    fifo_delete(&first[k].fifo);
+
+  free(first);
+}
+/* Reported delay: currently a fixed placeholder value. */
+static double vr_delay(rate_t * p)
+{
+  (void)p;                /* Not consulted yet. */
+  return 100; /* TODO */
+}
+/* Report state sizes: nothing shared, one rate_t per channel. */
+static void vr_sizes(size_t * shared, size_t * channel)
+{
+  *shared = 0;
+  *channel = sizeof(rate_t);
+}
+/* Initialise *channel with one stage per halving of max_io_ratio down to <= 1. */
+static char const * vr_create(void * channel, void * shared,double max_io_ratio,
+    void * q_spec, void * r_spec, double scale)
+{
+  double ratio = max_io_ratio;
+  int octaves = 0;
+  (void)shared, (void)q_spec, (void)r_spec;       /* Not used here. */
+  while (ratio > 1) ratio *= .5, ++octaves;
+  vr_init(channel, max_io_ratio, octaves, scale);
+  return 0;                                       /* Always returns 0. */
+}
+/* Identifier string for this engine. */
+static char const * vr_id(void)
+{
+  return "vr32";
+}
+/* Exported callback table; NOTE(review): order presumably fixed by the caller. */
+typedef void (* fn_t)(void);
+fn_t _soxr_vr32_cb[] = {
+  (fn_t)vr_input,
+  (fn_t)vr_process,
+  (fn_t)vr_output,
+  (fn_t)vr_flush,
+  (fn_t)vr_close,
+  (fn_t)vr_delay,
+  (fn_t)vr_sizes,
+  (fn_t)vr_create,
+  (fn_t)vr_set_io_ratio,
+  (fn_t)vr_id,
+};
diff --git a/tests/1-delay-clear.c b/tests/1-delay-clear.c
new file mode 100644
index 0000000..ba4d47c
--- /dev/null
+++ b/tests/1-delay-clear.c
@@ -0,0 +1,64 @@
+/* SoX Resampler Library      Copyright (c) 2007-15 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Test 1: exercises soxr_delay and soxr_clear */
+
+#ifdef NDEBUG /* N.B. assert used with active statements so enable always. */
+#undef NDEBUG /* Must undef above assert.h or other that might include it. */
+#endif
+
+#include <soxr.h>
+#include "../examples/examples-common.h"
+/* ranqd1(x): LCG step on an int32_t lvalue x; unsigned math so wrap is defined. */
+#define ranqd1(x) ((x) = (int32_t)(1664525u * (uint32_t)(x) + 1013904223u))
+#define franqd1(x) (float)(ranqd1(x) * (1. / (65536. * 32768.))) /* [-1,1) */
+
+#define irate 9600
+#define orate 4410
+/* Two passes: streamed soxr_process() output must equal soxr_oneshot() output. */
+int main(int argc, char const * arg[])
+{
+  soxr_error_t error;
+  int32_t ran = 0;
+  int j;
+
+  soxr_t soxr = soxr_create(irate, orate, 1, &error, NULL, NULL, NULL);
+  assert(!error);
+
+  for (j=0; j<2; ++j) {     /* 2nd pass re-uses the resampler after soxr_clear. */
+    float ibuf[irate], out[orate+2], obuf[orate+2], * ibuf1 = ibuf;
+    size_t ilen = AL(ibuf)-1, olen = AL(obuf), i, odone = 0, odone0, odone1=0;
+    soxr_quality_spec_t  q_spec = soxr_quality_spec(SOXR_HQ, 0);
+    /* Pseudo-random input; `ran' carries on from the previous pass. */
+    for (i=0; i<irate; ibuf[i++] = franqd1(ran));
+    /* Reference result, produced in a single call: */
+    error = soxr_oneshot(irate, orate, 1, ibuf, ilen, NULL,
+        out, AL(out), &odone0, NULL, &q_spec, NULL);
+    assert(!error);
+    assert(odone0==orate);
+    /* Feed all input once, then drain with no input (<= 100 samples a call). */
+    for (i=0; ilen || odone1; ++i) {
+      double out_samples = (double)orate / irate * (double)ilen;
+      double delayed_samples = soxr_delay(soxr);
+      unsigned max_out_samples = (unsigned)(out_samples + delayed_samples + .5);
+      assert(delayed_samples >= 0);
+      fprintf(stderr, "%5u %5u %5u\n",
+          (unsigned)ilen, max_out_samples, (unsigned)odone);
+      assert(max_out_samples+odone==odone0);  /* Delay accounts for the rest. */
+      error = soxr_process(soxr, ibuf1, ilen, NULL, obuf+odone, olen, &odone1);
+      assert(!error);
+      odone += odone1;
+      ibuf1 = NULL, ilen = 0;
+      olen = min(100, AL(obuf)-odone);
+    }
+    assert(odone==odone0);
+    /* Streamed output must match the reference sample for sample: */
+    for (i=0; i<odone && out[i]==obuf[i]; ++i);
+    assert(i==odone);
+
+    soxr_clear(soxr);
+  }
+  soxr_delete(soxr);
+
+  return 0 * argc * !arg;
+}
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
new file mode 100644
index 0000000..ee8dd0b
--- /dev/null
+++ b/tests/CMakeLists.txt
@@ -0,0 +1,62 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+# Every executable below is built with the project flags and linked to the library.
+set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${PROJECT_C_FLAGS}")
+link_libraries (${PROJECT_NAME} ${LIBM_LIBRARIES})
+# One executable per .c file in this directory, named after the file:
+file (GLOB SOURCES ${CMAKE_CURRENT_SOURCE_DIR}/*.c)
+foreach (fe ${SOURCES})
+  get_filename_component (f ${fe} NAME_WE)
+  add_executable (${f} ${fe})
+endforeach ()
+
+# Can't use c89 for this file:
+if (CMAKE_C_COMPILER_ID STREQUAL "GNU" OR CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  set_property (SOURCE throughput APPEND_STRING PROPERTY COMPILE_FLAGS "-std=gnu89")
+endif ()
+# Arguments for vector-gen and cmp-test.cmake (see the macros below):
+set (sweep_to_freq 22050)
+set (leader 1)
+set (len 16)
+math (EXPR base_rate "${sweep_to_freq} + ${sweep_to_freq}")
+# add_vector (r): rule running vector-gen to make ref-<r>.s32; adds it to `vectors'.
+macro (add_vector r)
+  set (output ${CMAKE_CURRENT_BINARY_DIR}/ref-${r}.s32)
+  add_custom_command (OUTPUT ${output} DEPENDS vector-gen ${CMAKE_CURRENT_LIST_FILE}
+    COMMAND vector-gen ${r} ${leader} ${len} 0 ${sweep_to_freq} 1 ${output})
+  set (vectors ${output} ${vectors})
+endmacro ()
+# add_cmp_test: test running cmp-test.cmake via `cmake -P'; adds both rate vectors.
+macro (add_cmp_test irate orate bits)
+  set (name ${bits}-bit-perfect-${irate}-${orate})
+  add_test (NAME ${name} COMMAND ${CMAKE_COMMAND} -Dbits=${bits} -DBIN=${BIN}
+    -DEXAMPLES_BIN=${EXAMPLES_BIN} -DlenToSkip=${leader} -Dorate=${orate}
+    -Dirate=${irate} -Dlen=${len} -P ${CMAKE_CURRENT_SOURCE_DIR}/cmp-test.cmake)
+  add_vector (${irate})
+  add_vector (${orate})
+endmacro ()
+# `bits' values to test depend on which engines were configured in:
+unset (test_bits)
+if (WITH_CR32 OR WITH_CR32S OR WITH_CR64 OR WITH_CR64S)
+  set (test_bits 20)
+endif ()
+if (WITH_CR64 OR WITH_CR64S)
+  set (test_bits ${test_bits} 28)
+endif ()
+# Rates paired with base_rate; 65537 is added only WITH_HI_PREC_CLOCK:
+set (rates 192000)
+if (WITH_HI_PREC_CLOCK)
+  set (rates ${rates} 65537)
+endif ()
+foreach (b ${test_bits})
+  foreach (r ${rates})
+    add_cmp_test (${base_rate} ${r} ${b})
+    add_cmp_test (${r} ${base_rate} ${b})
+  endforeach ()
+endforeach ()
+# Reference vectors are part of the default build unless cross-compiling:
+if (NOT CMAKE_CROSSCOMPILING)
+  add_custom_target (test-vectors ALL DEPENDS ${vectors})
+endif ()
+
+add_test (1-delay-clear ${BIN}1-delay-clear)
diff --git a/tests/README b/tests/README
new file mode 100644
index 0000000..44460d6
--- /dev/null
+++ b/tests/README
@@ -0,0 +1 @@
+A few tests on the pass-band performance; not a comprehensive test suite.
diff --git a/tests/bandwidth-test b/tests/bandwidth-test
new file mode 100755
index 0000000..4efdcc9
--- /dev/null
+++ b/tests/bandwidth-test
@@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests varying bandwidth.
+
+
+
+tool=./3-options-input-fn
+w=$(echo -e "$(sox --ver | sed 's/.*SoX v//') d\n14.4.1 k" | sort -Vr | head -1 | sed 's/.* //')  # d if SoX is newer than 14.4.1, else k
+
+spec="spectrogram -z120 -Z-20 -w$w -ho"
+ext=f32; e=0
+rate1=48000
+rate2=44100
+
+for n in 1 2; do  # The second pass runs with the two rates swapped.
+
+rate1n=$((rate1 / 2))
+
+#sox -r $rate1 -n 0.$ext synth 1s sq pad .03 .03  gain -1
+sox -r $rate1 -n 0.$ext synth 8 sin 0:$rate1n gain -1
+
+for pass in $(seq 79 5 99); do  # Vary the pass value only.
+  f=bw1-$rate2-p$(printf %02u $pass)-$w
+  $tool $rate1 $rate2 1 $e $e 4 0 $pass < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:100"
+done
+
+for pass in $(seq 79 5 99); do  # Vary pass and stop together (stop = 200 - pass).
+  f=bw2-$rate2-p$(printf %02u $pass)-$w
+  stop=$((200 - pass))
+  $tool $rate1 $rate2 1 $e $e 4 0 $pass $stop < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "bw-test pass:$pass stop:$stop"
+done
+
+r=$rate1; rate1=$rate2; rate2=$r
+
+done
+
+rm 0.$ext
diff --git a/tests/cmp-test.cmake b/tests/cmp-test.cmake
new file mode 100644
index 0000000..a836322
--- /dev/null
+++ b/tests/cmp-test.cmake
@@ -0,0 +1,30 @@
+# SoX Resampler Library       Copyright (c) 2007-13 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+math (EXPR quality "43 + (${bits} - 13) / 4")  # Integer maths: bits 20 -> 44, 28 -> 46.
+set (ofile ${irate}-${orate}-${quality}.s32)
+#message (STATUS "Output file = [${ofile}]")
+
+execute_process(COMMAND ${EXAMPLES_BIN}3-options-input-fn ${irate} ${orate} 1 2 2 ${quality} a
+  INPUT_FILE ref-${irate}.s32  # Resample the reference vector at the input rate...
+  OUTPUT_FILE ${ofile}         # ...into a vector at the output rate.
+  ERROR_VARIABLE test_error
+  RESULT_VARIABLE test_result)
+
+if (test_result)  # Set when the resampler did not exit with status 0.
+  message (FATAL_ERROR "Resampling failure: ${test_error}")
+endif ()
+
+set (percentageToCheck 98)  # Portion of the sweep (after the skipped lead-in) to compare.
+math (EXPR lenToCheck "${len} * ${percentageToCheck}")
+string (REGEX REPLACE "(..)$" ".\\1" lenToCheck "${lenToCheck}") # Divide by 100
+
+execute_process(COMMAND ${BIN}vector-cmp ref-${orate}.s32 ${ofile} ${orate} ${lenToSkip} ${lenToCheck} ${bits}
+  OUTPUT_VARIABLE test_output  # vector-cmp prints the measured accuracy in bits.
+  RESULT_VARIABLE test_result)
+
+if (test_result)  # Non-zero: accuracy below `bits', or a file was too short.
+  message (FATAL_ERROR ${test_output})
+else ()
+  message (STATUS ${test_output})
+endif ()
diff --git a/tests/eg-test b/tests/eg-test
new file mode 100755
index 0000000..ccf4ce3
--- /dev/null
+++ b/tests/eg-test
@@ -0,0 +1,48 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Exercises each example programme.
+
+
+
+len=8
+w=$(echo -e "$(sox --ver | sed 's/.*SoX v//') d\n14.4.1 k" | sort -Vr | head -1 | sed 's/.* //')  # d if SoX is newer than 14.4.1, else k
+#vg="valgrind --leak-check=full --show-reachable=yes"
+
+
+
+# Run example 1 (prefixed by $vg when that is enabled above):
+$vg ./1-single-block
+
+
+
+# Examples 2-4 must each convert 96k<->44k1 and produce identical output:
+ir=96000
+or=44100
+for i in 1 2; do
+  prev=""
+  sox -r $ir -n 0.f32 synth $len sin 0+$((ir / 2))
+  for f in $(find . -type f -executable -name "[2-4]*"); do
+    $vg $f $ir $or < 0.f32 > $f.f32
+    if [ -n "$prev" ]; then cmp $f.f32 $prev; fi  # Compare with the previous example's output.
+    prev=$f.f32
+  done
+  or=96000
+  ir=44100
+done
+rm *.f32
+
+
+
+# Run the variable-rate example in each of its 4 modes, plotting a spectrogram:
+variations=(slow-sweep fast-changing)
+signals=(sine-wave saw-tooth-wave)
+for n in 0 1 2 3; do
+  signal=${signals[$((n % 2))]}
+  variation=${variations[$((n / 2))]}
+  $vg ./5-variable-rate $n | sox -tf32 -r44100 -c1 - -n spectrogram -z130 -hw$w -o v$n-$w.png -X 50 -c "variation:$variation signal:$signal"
+  vg=""
+done
diff --git a/tests/io-test b/tests/io-test
new file mode 100755
index 0000000..608bc9a
--- /dev/null
+++ b/tests/io-test
@@ -0,0 +1,65 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests IO
+
+
+
+ir=65537
+or=44100
+len=16
+f=1/32768
+g=32768:0
+tool=./3-options-input-fn
+w=$(echo -e "$(sox --ver | sed 's/.*SoX v//') d\n14.4.1 k" | sort -Vr | head -1 | sed 's/.* //')  # d if SoX is newer than 14.4.1, else k
+
+types=(f32 f64 s32 s16)  # Indexed by the tool's numeric I/O type argument.
+
+zs=(180 180 180 180 180 120 120 120 120)  # Spectrogram -z value for each do_one call, in order.
+
+do_one() {  # $1: input type; $2: output spec; $3: quality.  Uses globals c and n.
+  it=${types[$1]}; ot=${types[$(($2 % 4))]}
+  $tool $ir $or $c $1 $2 $3 < $c.$it > a.$ot
+  sox -r $or -c $c a.$ot -n spectrogram -X50 -hw$w -z${zs[$n]} -o io$c$n-$w.png -c "io-test i:$it o:$ot ($2) q:$3"
+  ./4-split-channels $ir $or $c $1 $2 $3 < $c.$it > b.$ot
+  [ $2 != 3 ] && cmp a.$ot b.$ot ||
+    test $(sox -mv-1 -r$or -c$c a.$ot -r$or -c$c b.$ot -n stats 2>&1 |grep Pk\ l|tr ' ' '\n'|grep '[0-9]'|uniq) = -84.29
+  rm [ab].$ot
+  n=$((n + 1))
+}
+
+if test z$1 != z; then j=$1; else j=1; fi  # j: highest channel count to test.
+
+for c in $(seq 1 $j); do
+  for n in 0 1 2 3; do
+    sox -R -r $ir -n $c.${types[$n]} synth $len sin $f gain -.1
+  done
+
+  n=0
+  do_one 1 2 5
+  do_one 2 0 5
+  for m in 0 1 2 3; do do_one $m $m 5; done
+  do_one 3 2 3
+  do_one 0 3 3
+  do_one 0 11 3
+
+  f="$f sin $g"  # One more synth channel for the next channel count.
+  g=0+32768
+done
+
+rm ?.[sf][0-9][0-9]
+
+
+
+# Same-rate conversion f32 -> s32 -> f32 must reproduce the input exactly:
+
+for i in 1 2 3; do
+  prev=""
+  sox -n -c $i 0.f32 synth $len gain -.1
+  $tool 1 1 $i 0 2 < 0.f32 | $tool 1 1 $i 2 0 > 1.f32
+  cmp [01].f32
+done
+rm *.f32
diff --git a/tests/large-ratio-test b/tests/large-ratio-test
new file mode 100755
index 0000000..540c5df
--- /dev/null
+++ b/tests/large-ratio-test
@@ -0,0 +1,22 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests interpolating then decimating by the same, large ratio.
+
+tool=../examples/3-options-input-fn
+w=$(echo -e "$(sox --ver | sed 's/.*SoX v//') d\n14.4.1 k" | sort -Vr | head -1 | sed 's/.* //')  # d if SoX is newer than 14.4.1, else k
+q=4
+if test x$1 = x; then ratio=1e5; else ratio=$1; fi   # $1: conversion ratio.
+if test x$2 = x; then rate=8000; else rate=$2; fi    # $2: sample rate of the test signal.
+
+sox -r$rate -n 1.s32 synth 10 sin 0:$(expr $rate / 2) vol .9375
+sync
+
+time { $tool 1 $ratio 1 2 1 $q a < 1.s32 | $tool $ratio 1 1 1 2 $q a > 2.s32;}
+
+sox -mv-1 -r$rate -c1 1.s32 -r$rate -c1 2.s32 -n spectrogram -hw$w -z150 -o lr-$w.png -c "large-ratio-test q:$q ratio:$ratio"
+
+rm [12].s32
diff --git a/tests/phase-test b/tests/phase-test
new file mode 100755
index 0000000..3c34268
--- /dev/null
+++ b/tests/phase-test
@@ -0,0 +1,39 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests varying phase-response.
+
+tool=./3-options-input-fn
+w=$(echo -e "$(sox --ver | sed 's/.*SoX v//') d\n14.4.1 k" | sort -Vr | head -1 | sed 's/.* //')  # d if SoX is newer than 14.4.1, else k
+spec="spectrogram -z160 -Z-20 -X 2000 -w$w -ho"
+ext=f32; e=0
+rate1=48000
+rate2=44100
+
+for n in 1 2; do  # The second pass runs with the two rates swapped.
+  sox -r $rate1 -n 0.$ext synth 1s sq pad .03 .03  gain -1
+
+  # Each phase name paired with each filter name:
+  names=(linear-phase intermediate-phase maximum-phase minimum-phase)
+  filters=(standard-filter steep-filter)
+
+  for q in $(seq 0 7); do
+    f=ph-$rate2-q$q-$w
+    name=${names[$((q % 4))]}
+    filter=${filters[$((q / 4))]}
+    $tool $rate1 $rate2 1 $e $e ${q}6 < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test $filter $name"
+  done
+
+  # Explicit phase-response percentages, 0% to 100% in steps of 20:
+  for q in $(seq 0 20 100); do
+    f=ph-$rate2-p$(printf %03u $q)-$w
+    $tool $rate1 $rate2 1 $e $e 46 0 0 0 $q < 0.$ext | sox -c1 -r$rate2 -t $ext - -n $spec $f.png -c "ph-test phase:${q}%"
+  done
+
+  r=$rate1; rate1=$rate2; rate2=$r
+done
+
+rm 0.$ext
diff --git a/tests/q-test b/tests/q-test
new file mode 100755
index 0000000..f274cb5
--- /dev/null
+++ b/tests/q-test
@@ -0,0 +1,73 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests conversion qualities 0..7 & variable-rate.
+
+
+
+tool=./3-options-input-fn
+w=$(echo -e "$(sox --ver | sed 's/.*SoX v//') d\n14.4.1 k" | sort -Vr | head -1 | sed 's/.* //')  # d if SoX is newer than 14.4.1, else k
+ext=f64; e=1
+c=1
+q1=0; q2=7
+rates=48000
+zs=(50 87 87 87 111 135 159 180 95)  # Spectrogram -z per quality 0..7; the last entry is for variable-rate.
+
+zz() {  # $1: index into zs.  Prints the spectrogram effect arguments.
+  echo "spectrogram -z${zs[$1]} -Z-30 -w$w -ho"
+}
+
+for rate0 in $rates; do
+
+rate1=$rate0
+rate2=44100
+
+for n in 1 2; do  # First rate0 -> 44100, then 44100 -> rate0.
+
+rate1n=$((rate1 / 2))
+
+
+
+# Sweep through each quality, plotted as a spectrogram:
+
+sox -r $rate1 -n -c $c 0.$ext synth 8 sin 0:$rate1n gain -1
+
+for q in $(seq $q1 $q2); do
+  f=qa-$rate1-$rate2-$q
+  $tool $rate1 $rate2 $c $e $e $q  0 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f
+done
+q=8
+f=qa-$rate1-$rate2-v
+$tool $rate1 $rate2 $c $e $e 4 20 < 0.$ext | sox -c$c -r$rate2 -t $ext - -n $(zz $q) $f-$w.png -c $f
+
+
+
+# Impulse through each quality, kept as .wav:
+
+#: << :
+sox -r $rate1 -n 0.$ext synth 1s sq pad .03 .03  gain -1
+
+for q in $(seq $q1 $q2); do
+  f=qb-$rate1-$rate2-$q
+  $tool $rate1 $rate2 1 $e $e $q  0 < 0.$ext | sox -c1 -r$rate2 -t $ext - $f.wav
+done
+q=8
+f=qb-$rate1-$rate2-v
+$tool $rate1 $rate2 1 $e $e 4 20 < 0.$ext | sox -c1 -r$rate2 -t $ext - $f.wav
+
+# Combine the impulse responses into one multi-channel file (for inspection in Audacity):
+sox -M qb-$rate1-$rate2-?.wav q$rate1-$rate2.wav
+
+rm qb-$rate1-$rate2-?.wav
+:
+
+rate1=44100
+rate2=$rate0
+
+done
+done
+
+rm 0.$ext
diff --git a/tests/scripts b/tests/scripts
new file mode 100755
index 0000000..8b6023f
--- /dev/null
+++ b/tests/scripts
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+../../tests/bandwidth-test
+../../tests/eg-test
+../../tests/io-test 3      # Channel counts 1 to 3.
+../../tests/large-ratio-test
+../../tests/phase-test
+../../tests/q-test
+../../tests/time-test 1    # 1 channel.
+../../tests/time-test 2    # 2 channels.
diff --git a/tests/throughput-test b/tests/throughput-test
new file mode 100755
index 0000000..aef36f6
--- /dev/null
+++ b/tests/throughput-test
@@ -0,0 +1,11 @@
+#!/bin/sh
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-16 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+if test -r throughput.exe; then wine=wine; fi  # A .exe build is run through wine.
+
+if test /$1 = /; then list="$(seq 0 3)"; else list="$*"; fi  # Default: output specs 0..3.
+
+for n in $list; do $wine ./throughput 44.1 48 1 0 $n 4; done
diff --git a/tests/throughput-test.bat b/tests/throughput-test.bat
new file mode 100644
index 0000000..46b8f7d
--- /dev/null
+++ b/tests/throughput-test.bat
@@ -0,0 +1,5 @@
+@echo off
+rem SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+rem Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+for /L %%i in (0,1,3) DO throughput 44.1 48 1 0 %%i
diff --git a/tests/throughput.c b/tests/throughput.c
new file mode 100644
index 0000000..c52b885
--- /dev/null
+++ b/tests/throughput.c
@@ -0,0 +1,141 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+#include <soxr.h>
+#include "rint.h"
+#include "../examples/examples-common.h"
+
+#define k 1000
+
+#if defined _WIN32
+  #define WIN32_LEAN_AND_MEAN
+  #include <windows.h>
+  #define timerStart(msecs) LARGE_INTEGER start, stop, tmp; \
+      QueryPerformanceCounter(&start), QueryPerformanceFrequency(&tmp), \
+      stop.QuadPart = (msecs * tmp.QuadPart + k/2) / k
+  #define timerRunning() (QueryPerformanceCounter(&tmp), \
+      (tmp.QuadPart-start.QuadPart < stop.QuadPart))
+#else
+  #include <sys/time.h>
+  #if defined timeradd
+    #define K k
+    #define tv_frac tv_usec
+    #define timespec timeval
+    #define get_time(x) gettimeofday(x, NULL)
+  #else
+    #include <time.h>
+    #include <unistd.h>
+    #if defined _POSIX_TIMERS && _POSIX_TIMERS > 0
+      #define K (k*k)
+      #define tv_frac tv_nsec
+      #if defined _POSIX_MONOTONIC_CLOCK
+        #define get_time(x) clock_gettime(CLOCK_MONOTONIC, x)
+      #else
+        #define get_time(x) clock_gettime(CLOCK_REALTIME, x)
+      #endif
+    #else
+      #include <sys/timeb.h>
+      #define K 1
+      #define tv_frac millitm
+      #define tv_sec time
+      #define timespec timeb
+      #define get_time(x) ftime(x)
+    #endif
+  #endif
+
+  #define timerStart(msecs) struct timespec stop, tmp; get_time(&stop), \
+      stop.tv_frac += (msecs%k)*K, \
+      stop.tv_sec  += msecs/k + stop.tv_frac/(K*k), \
+      stop.tv_frac %= K*k
+  #define timerRunning() (get_time(&tmp), (tmp.tv_sec < stop.tv_sec || \
+      (tmp.tv_sec == stop.tv_sec && tmp.tv_frac < stop.tv_frac))) /* now < stop */
+#endif
+
+int main(int n, char const * arg[])  /* All arguments are optional & positional. */
+{
+  char const *     const arg0 = n? --n, *arg++ : "", * engine = "";
+  double          const irate = n? --n, atof(*arg++) : 96000.;
+  double          const orate = n? --n, atof(*arg++) : 44100.;
+  unsigned        const chans = n? --n, (unsigned)atoi(*arg++) : 1;
+  soxr_datatype_t const itype = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned        const ospec = n? --n, (soxr_datatype_t)atoi(*arg++) : 0;
+  unsigned long const q_recipe= n? --n, strtoul(*arg++, 0, 16) : SOXR_HQ;
+  unsigned long const q_flags = n? --n, strtoul(*arg++, 0, 16) : 0;
+  double   const passband_end = n? --n, atof(*arg++) : 0;
+  double const stopband_begin = n? --n, atof(*arg++) : 0;
+  double const phase_response = n? --n, atof(*arg++) : -1;
+  int       const use_threads = n? --n, atoi(*arg++) : 1;
+  soxr_datatype_t const otype = ospec & 3;  /* Low 2 bits: output data-type. */
+
+  soxr_quality_spec_t       q_spec = soxr_quality_spec(q_recipe, q_flags);
+  soxr_io_spec_t            io_spec = soxr_io_spec(itype, otype);
+  soxr_runtime_spec_t const runtime_spec = soxr_runtime_spec(!use_threads);
+
+  /* Allocate resampling input and output buffers in proportion to the input
+   * and output rates: */
+  #define buf_total_len 15000  /* In samples per channel. */
+  size_t const osize = soxr_datatype_size(otype) * chans;
+  size_t const isize = soxr_datatype_size(itype) * chans;
+  size_t const olen0= (size_t)(orate * buf_total_len / (irate + orate) + .5);
+  size_t const olen = min(max(olen0, 1), buf_total_len - 1);
+  size_t const ilen = buf_total_len - olen;
+  void * const obuf = malloc(osize * olen);
+  void * const ibuf = malloc(isize * ilen);
+
+  size_t odone = 0, clips = 0, omax = 0, i;
+  soxr_error_t error;
+  soxr_t soxr;
+  int32_t seed = 0;
+  char const * e = getenv("SOXR_THROUGHPUT_GAIN");
+  double gain = e? atof(e) : .5;
+
+  /* Overrides (if given): */
+  if (passband_end   > 0) q_spec.passband_end   = passband_end / 100;
+  if (stopband_begin > 0) q_spec.stopband_begin = stopband_begin / 100;
+  if (phase_response >=0) q_spec.phase_response = phase_response;
+  io_spec.flags = ospec & ~7u;  /* Bits above the low 3 are passed on as flags. */
+
+  /* Create a stream resampler: */
+  soxr = soxr_create(
+      irate, orate, chans,         /* Input rate, output rate, # of channels. */
+      &error,                         /* To report any error during creation. */
+      &io_spec, &q_spec, &runtime_spec);
+
+#define ranqd1(x) ((x) = 1664525 * (x) + 1013904223) /* int32_t x */
+#define dranqd1(x) (ranqd1(x) * (1. / (65536. * 32768.))) /* [-1,1) */
+#define RAND (dranqd1(seed) * gain)
+#define DURATION_MSECS 125
+#define NUM_ATTEMPTS 8
+
+  if (!error) {                         /* If all is well, run the resampler: */
+    engine = soxr_engine(soxr);
+    switch (itype & 3) {                /* Fill the input with scaled noise: */
+      case 0: for (i=0;i<ilen*chans; ((float   *)ibuf)[i]=(float  )RAND, ++i); break;
+      case 1: for (i=0;i<ilen*chans; ((double  *)ibuf)[i]=(double )RAND, ++i); break;
+      case 2: for (i=0;i<ilen*chans; ((int32_t *)ibuf)[i]=rint32(65536.*32768*RAND), ++i); break;
+      case 3: for (i=0;i<ilen*chans; ((int16_t *)ibuf)[i]=rint16(    1.*32768*RAND), ++i); break;
+    }
+                                                       /* Resample in blocks: */
+    for (i=0; i<NUM_ATTEMPTS; ++i) {    /* Keep the best of several attempts. */
+      size_t itotal = 0, ototal = 0;
+      timerStart(DURATION_MSECS);
+      do {           /* Supply input only if the last call left output room. */
+        size_t const ilen1 = odone < olen? ilen : 0;
+        error = soxr_process(soxr, ibuf, ilen1, NULL, obuf, olen, &odone);
+        itotal += ilen1;
+        ototal += odone;
+      } while (!error && timerRunning());
+      omax = max(omax, ototal);
+    }
+  }
+                      /* Tidy up; skip the query if no resampler was created: */
+  if (soxr) clips = *soxr_num_clips(soxr); /* Can occur only with integer output. */
+  soxr_delete(soxr);
+  free(obuf), free(ibuf);
+                                                              /* Diagnostics: */
+  fprintf(stderr, "%-26s %s; %lu clips; I/O: %s (%-5s) %.2f Ms/s\n",
+      arg0, soxr_strerror(error), (long unsigned)clips,
+      ferror(stdin) || ferror(stdout)? strerror(errno) : "no error", engine,
+      1e-6 * k / DURATION_MSECS * chans * (double)omax);
+  return !!error;
+}
diff --git a/tests/time-test b/tests/time-test
new file mode 100755
index 0000000..f253717
--- /dev/null
+++ b/tests/time-test
@@ -0,0 +1,36 @@
+#!/usr/bin/env bash
+set -e
+
+# SoX Resampler Library       Copyright (c) 2007-15 robs@users.sourceforge.net
+# Licence for this file: LGPL v2.1                  See LICENCE for details.
+
+# Tests rate conversion time for qualities 0..7 & variable-rate.
+
+tool=./3-options-input-fn
+ext=f32; e=0
+c=${1:-2}                     # $1: number of channels.
+qs="${2:-$(seq 0 7) v}"       # $2: qualities to time; v is variable-rate.
+rates="48000 77773 96000"
+time=$(which time)
+BASE=$(basename $0)
+TIME=/tmp/$BASE-time-$$       # Scratch file: elapsed time of each run.
+ERR=/tmp/$BASE-err-$$         # Scratch file: the tool's stderr for each run.
+if uname -m | grep -q ^arm; then len=60; else len=600; fi  # Shorter input on arm.
+export OMP_NUM_THREADS=2
+
+for rate0 in $rates; do
+  rate1=44100
+  rate2=$rate0
+  for n in 1 2; do  # First 44100 -> rate0, then rate0 -> 44100.
+    sox -R -r $rate1 -n -c $c 0.$ext synth $len noise; sync
+    for q in $qs; do
+      if test $q = v; then Q="4 20"; else Q=$q; fi
+      $time -f %e -o $TIME $tool $rate1 $rate2 $c $e $e $Q < 0.$ext > /dev/null 2> $ERR
+      echo $rate1 '-->' $rate2 c=$c q=$q t=$(cat $TIME) $(sed 's/.*(/(/' < $ERR)
+    done
+    rate1=$rate0
+    rate2=44100
+  done
+done
+
+rm 0.$ext; rm -f "$TIME" "$ERR"  # Also remove the scratch files once all runs succeeded.
diff --git a/tests/vector-cmp.c b/tests/vector-cmp.c
new file mode 100644
index 0000000..f90cc7f
--- /dev/null
+++ b/tests/vector-cmp.c
@@ -0,0 +1,56 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Utility used to help test the library; not for general consumption.
+ *
+ * Measure the peak bit difference between two files.  */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include "../src/rint.h"
+#include "../examples/examples-common.h"
+
+#define TYPE 0 /* As vector-gen */
+
+#if TYPE
+  #define sample_t double
+  #define N 50
+  #define DIFF(s1,s2) abs(rint32((s1-s2)*ldexp(1,N-1)))
+#else
+  #define sample_t int32_t
+  #define N 32
+  #define DIFF(s1,s2) abs((int)(s1-s2))
+#endif
+
+int main(int argc, char const * arg[])  /* Reads arg[1..5] unconditionally. */
+{
+  int     two      = !!arg[2][0];  /* Empty 2nd file name: compare f1 with 0. */
+  FILE    * f1 = fopen(arg[1], "rb"), * f2 = two? fopen(arg[2], "rb") : 0;
+  double  rate     = atof (arg[3]), /* Sample-rate */
+          skip_len = atof (arg[4]), /* Skip length in seconds */
+          len      = atof (arg[5]), /* Compare length in seconds */ r;
+  int i = 0, count = rint32(rate * len), max = 0, diff;
+  sample_t s1, s2;
+  if (!f1 || (two && !f2)) return fprintf(stderr, "cannot open input file\n"), 1;
+  fseek(f1, rint32(rate * skip_len) * (int)sizeof(s1), SEEK_CUR);
+  if (two) {
+    fseek(f2, rint32(rate * skip_len) * (int)sizeof(s2), SEEK_CUR);
+    for (; i < count &&
+        fread(&s1, sizeof(s1), 1, f1) &&
+        fread(&s2, sizeof(s2), 1, f2); ++i) {
+      diff = DIFF(s1, s2);
+      max = max(max, diff);        /* Track the peak absolute difference. */
+    }
+  }
+  else for (; i < count && fread(&s1, sizeof(s1), 1, f1); ++i) {
+    diff = DIFF(s1, 0);
+    max = max(max, diff);
+  }
+
+  if (i != count) {                /* A file ended before `count' samples. */
+    fprintf(stderr, "incorrect file length\n");
+    return 1;
+  }
+  printf("%f\n", r = N-log(max)/log(2));   /* N minus log2 of the peak diff. */
+  return argc>6? r<atof(arg[6]) : 0;       /* Fail if below the arg[6] limit. */
+}
diff --git a/tests/vector-gen.c b/tests/vector-gen.c
new file mode 100644
index 0000000..0446ec9
--- /dev/null
+++ b/tests/vector-gen.c
@@ -0,0 +1,61 @@
+/* SoX Resampler Library      Copyright (c) 2007-16 robs@users.sourceforge.net
+ * Licence for this file: LGPL v2.1                  See LICENCE for details. */
+
+/* Utility used to help test the library; not for general consumption.
+ *
+ * Generate a swept sine to a file, with `lead-in' section.  */
+
+#define TYPE 0 /* calc/store: 0:flt64/int32 1:flt80/flt64 2:flt128/flt64 */
+
+#if TYPE > 1
+  #include <quadmath.h>
+#endif
+
+#include "math-wrap.h"
+#include <stdlib.h>
+#include <stdio.h>
+
+#if TYPE
+  #if TYPE > 1
+    #define modf modfq
+    #define cos cosq
+    #define sin sinq
+    #define PI M_PIq
+    #define real __float128
+    #define atof(x) strtoflt128(x, 0)
+  #else
+    #define modf modfl
+    #define cos cosl
+    #define sin sinl
+    #define PI M_PIl
+    #define real long double
+  #endif
+  #define MULT 1
+  #define OUT(d) double output = d
+#else
+  #define PI M_PI
+  #define real double
+  #include "rint.h"
+  #define MULT (32768. * 65536 - 1/scale)
+  #define OUT(d) int32_t output = rint32(d)
+#endif
+
+int main(int argc, char const * argv[])  /* Reads argv[1..7] unconditionally. */
+{
+  real rate         = atof(argv[1]), /* Rate for this vector */
+       lead_in_len  = atof(argv[2]), /* Lead-in length in seconds */
+       len          = atof(argv[3]), /* Sweep length (excl. lead_in_len) */
+       f1           = atof(argv[4]), /* Start frequency of the sweep */
+       f2           = atof(argv[5]), /* End frequency of the sweep */
+       scale        = atof(argv[6]), /* For headroom */
+       n1 = rate * -lead_in_len,     /* First sample index (<= 0) */
+       m = (f2 - f1) / (rate * len * 2), dummy;
+  FILE * file = fopen(argv[7], "wb");
+  int i = (int)n1, err = !file || i != n1; /* Lead-in must be a whole # of samples. */
+  for (; !err && i < (int)(rate*(len+lead_in_len)+.5); ++i) {
+    real d = sin(2 * PI * modf((f1 + i * m) * i / rate, &dummy));
+    OUT((double)(scale * MULT * d));
+    err = fwrite(&output, sizeof(output), 1, file) != 1;
+  }
+  return err | !argc | (file && fclose(file)); /* fclose flushes: report its failure too. */
+}
-- 
2.30.2